Hello,
I have counts data with 366 samples as columns and more than 50k ensembl ids as rownames. The data is stored in a dataframe "U".
Data looks like following:
S1 S2 S3
ENSG00000000003.13 5745 8742 7257
ENSG00000000005.5 4 1 0
ENSG00000000419.11 1394 2211 615
ENSG00000000457.12 344 1166 692
ENSG00000000460.15 405 480 258
ENSG00000000938.11 144 132 110
table(df$Status)
F2 F3 F4 NE
125 220 20 1
Initially I have used limma, Glimma, edgeR for differential analysis which is mentioned in this link (http://combine-australia.github.io/RNAseq-R/06-rnaseq-day1.html) I followed the same steps with my data.
library(limma)
library(Glimma)
library(edgeR)
mycpm <- cpm(U)
head(mycpm)
thresh <- mycpm > 0.5
head(thresh)
keep <- rowSums(thresh) >= 2
table(keep)
summary(keep)
counts.keep <- U[keep,]
dim(counts.keep)
y <- DGEList(counts.keep)
y
y <- calcNormFactors(y,method = "TMM")
group <- factor(paste0(df$Status))
design2 <- model.matrix(~ 0 + group)
contrast.matrix <- makeContrasts(F2-F3, levels=design2)
contrast.matrix
Contrasts
Levels F2 - F3
F2 1
F3 -1
F4 0
NE 0
v <- voom(y, design2, plot=TRUE)
fit <- lmFit(v, design2)
fitC <- contrasts.fit(fit, contrast.matrix)
fitC <- eBayes(fitC)
dim(fitC)
tab <- topTable(fitC,adjust="BH",n=Inf)
head(tab)
logFC AveExpr t P.Value adj.P.Val B
ENSG00000129749.3 0.4809976 -2.100567 5.158856 4.079325e-07 0.01019219 4.776599
ENSG00000121274.11 -0.3031990 3.152421 -4.356722 1.716605e-05 0.10013460 2.675503
ENSG00000185973.9 -0.2884105 2.409274 -4.247311 2.746711e-05 0.10013460 2.212433
ENSG00000176170.12 1.0325463 1.930914 4.233827 2.908560e-05 0.10013460 2.157469
ENSG00000132744.6 -0.8300652 3.152324 -4.218580 3.102526e-05 0.10013460 2.146414
ENSG00000172943.17 -0.3884991 4.481074 -4.205040 3.285071e-05 0.10013460 2.125056
summary(decideTests(fitC))
F2 - F3
-1 0
0 24984
1 1
So with this I found only one differentially expressed gene (based on adj.P.Val < 0.05)
################################################################################################
I also tried using edgeR with the same data. I used the same code mentioned in edgeR tutorial for differential expression with featurecounts data.
library(edgeR)
group <- factor(paste0(df$Status))
y <- DGEList(U, group=group)
keep <- rowSums(cpm(y) > 0.5) >= 2
table(keep)
summary(keep)
Mode FALSE TRUE NA's
logical 35503 24985 0
y <- y[keep, , keep.lib.sizes=FALSE]
y <- calcNormFactors(y, method = "TMM")
design2 <- model.matrix(~ 0 + group)
colnames(design2) <- c("F2","F3","F4","NE")
design2
y <- estimateDisp(y, design2, robust=TRUE)
y$common.dispersion
[1] 1.030704
fit <- glmQLFit(y, design2, robust=TRUE)
head(fit$coefficients)
F2 F3 F4 NE
ENSG00000000003.13 -9.614857 -9.514155 -9.219875 -9.388728
ENSG00000000005.5 -17.008674 -16.908788 -16.166986 -16.630297
ENSG00000000419.11 -10.940426 -10.914407 -10.898501 -10.804819
ENSG00000000457.12 -11.582309 -11.493746 -11.423292 -12.203859
ENSG00000000460.15 -12.362215 -12.205290 -11.947569 -12.040664
ENSG00000000938.11 -12.514476 -12.640490 -13.160899 -13.074225
contrast.matrix <- makeContrasts(F2-F3, levels=design2)
contrast.matrix
Contrasts
Levels F2 - F3
F2 1
F3 -1
F4 0
NE 0
qlf <- glmQLFTest(fit, contrast=contrast.matrix)
topTags(qlf)
Coefficient: 1*F2 -1*F3
logFC logCPM F PValue FDR
ENSG00000124232.9 7.351170 0.86857907 278.9784 6.083121e-47 1.519868e-42
ENSG00000102195.8 7.077038 -0.11326609 236.7369 1.532441e-41 1.914402e-37
ENSG00000165066.12 6.034396 -1.42387186 224.2053 7.269869e-40 6.054590e-36
ENSG00000138161.11 3.629804 0.03880203 210.2262 5.937722e-38 3.242534e-34
ENSG00000164434.10 5.533205 -0.92141498 209.9531 6.488962e-38 3.242534e-34
ENSG00000256463.7 5.278677 -2.52333685 201.0082 1.152294e-36 4.798346e-33
ENSG00000233887.1 4.054302 -4.49118571 200.4813 3.120264e-36 1.113711e-32
ENSG00000196431.3 4.767828 -1.79054718 192.6636 1.759004e-35 5.173259e-32
ENSG00000280109.1 5.872673 0.25250281 192.4883 1.863491e-35 5.173259e-32
ENSG00000180053.7 3.349894 -4.85962856 193.4720 2.948655e-35 7.367214e-32
summary(decideTestsDGE(qlf))
[,1]
-1 949
0 22517
1 1519
tr <- glmTreat(fit, contrast=contrast.matrix, lfc=log2(1.2))
tab2 <- topTags(tr,n=Inf)
keep <- tab2$table$FDR <= 0.05
summary(decideTestsDGE(tr))
[,1]
-1 537
0 23545
1 903
So from this 1440 (537 downregulated and 903 upregulated) genes are detected as Differentially expressed genes with fold-change significantly above 1.2 at an FDR cut-off of 5%.
I would like to know why there is lot of difference between these two ways for differential analysis?
In the first way (limma,Glimma,edgeR) I found only one differentially expressed gene. In this process data will be transformed into Voom. Is it because of that? From few days back I'm trying in this way but couldn't find more than 1 differentially expressed gene.
But now I used edgeR which is mentioned in tutorial from feature counts. I used above code and got around 1440 DE genes.
Can anyone clear my doubt? Really appreciate your kind help.
Thank you
Venkatesh
I'd say this is definitely an unexpected result. limma and edgeR generally have pretty good agreement on which genes are significant, and even when they don't, they should still give similar logFC and logCPM/AveExpr values. The fact that edgeR is reporting logFC values as high as 7 and limma is only reporting 0.48 makes me think that something is wrong in one or the other. I don't see any obvious mistakes in your code, so I'm not sure where the discrepancy has occurred, but I would advise you to run through your limma and edgeR code and check each step to make sure it matches your expectations. Perhaps The log transform was somehow run twice on the limma data, or something like that.