Hello everybody,
I have analysed an experiment of ribodepleted samples using both DESeq2 and edgeR robust. I read that one can expect a concordance of 70-80% between both tools. Here, this is not the case.
In this experiment, there are 3 conditions : cell lines of patient with one specific mutation (6 samples), cell lines of patient with another specific mutation (6 samples) and cell lines of controls (3 samples).
I attached the comparison of edgeR and DESeq2 for:
1/ cell lines of patient (12 samples) vs cell lines of controls (3 samples): http://i.imgur.com/gYWJ26b.png?1
2/ cell lines of patient with one specific mutation (6 samples) vs cell lines of patient with another specific mutation (6 samples): http://i.imgur.com/7JClh40.png?1
In both comparisons, I compare all the deregulated genes (top panel), genes with padj<=0.001 (middle panel) and genes with at least an average of 100 reads in one of the conditions (bottom panel). I wanted to see if deregulated genes found by only one tool were enriched in low significant or lowly expressed genes, which does not seem to be the case.
In the first comparison, DEseq2 finds more up-regulated genes, whereas edgeR finds more down-regulated genes.
In the second comparison, barely one third of the deregulated genes are found by both methods.
Any idea for these discrepancies? What do you think about these results?
I can provide the code I used for more details.
Thank you in advance for your help,
Jane

Put some code in your post to show how you ran each of the two analyses.
Sure, here is the DESeq2 code for the first analysis:
library("DESeq2") ##################################### ### Creating CountDataSet object ### ##################################### Nom=c("PatientCellLine_ARN_COM","PatientCellLine_ARN_GAN","PatientCellLine_ARN_GEN","PatientCellLine_ARN_GRA","PatientCellLine_ARN_KOR","PatientCellLine_ARN_LEV","PatientCellLine_ARN_PAR","PatientCellLine_ARN_POP","PatientCellLine_ARN_RIP","PatientCellLine_ARN_SAU","PatientCellLine_ARN_SEG","PatientCellLine_ARN_SEV","Ctrl3_ARN_3","Ctr_ARN_2","Ctr_ARN_3") Status=c("PatientCellLine", "PatientCellLine","PatientCellLine", "PatientCellLine", "PatientCellLine","PatientCellLine", "PatientCellLine", "PatientCellLine", "PatientCellLine","PatientCellLine", "PatientCellLine", "PatientCellLine", "Ctrl", "Ctrl", "Ctrl") Fichier = paste(Nom,"gencod.htseqCount",sep=".") DataFrame=data.frame(Nom,Fichier,Status) ################################################ ### Model ### ### Estimation of sizeFactors and Dispersion ### ################################################ dds_raw=DESeqDataSetFromHTSeqCount(DataFrame,"/home/PatientCellLine_ARN/Results/Data/17052016", ~Status) # 63568 colData(dds_raw)$Status = factor(colData(dds_raw)$Status, levels=c("Ctrl","PatientCellLine")) colData(dds_raw)$Status = relevel(colData(dds_raw)$Status, "Ctrl") write.table(counts(dds_raw), file="PatientCellLine_vs_3Ctrl_DESeq2_RawTableFull",quote=FALSE,row.names=TRUE, col.names=TRUE,sep="\t") length( which( rowSums(counts(dds_raw)) ==0 ) ) #22671 dds <- dds_raw[ rowSums(counts(dds_raw)) > 1, ] #37227 dds=estimateSizeFactors(dds) dds=estimateDispersions(dds) ##################################### ### Differential expression Tests ### ##################################### dds=nbinomWaldTest(dds) ddsFinal=dds resultsNames(ddsFinal) "Intercept" "StatusCtrl" "StatusPatientCellLine" res1=results(ddsFinal,contrast=c(0,-1,1),pAdjustMethod="BH", cooksCutoff=FALSE, independentFiltering=TRUE,alpha=0.01) sum(res1$padj < 0.01, na.rm=TRUE) summary(res1) resNoFilt=results(ddsFinal,contrast=c(0,-1,1),pAdjustMethod="BH", cooksCutoff=FALSE, independentFiltering=FALSE,alpha=0.01) table(filtering=(res1$padj<0.01), noFiltering=(resNoFilt$padj<0.01)) res1$FoldChange=2^(res1$log2FoldChange) res1$MeanPatientCellLine <- rowMeans(counts(ddsFinal,normalized=TRUE)[,c(1:12)]) res1$MeanCtrl <- rowMeans(counts(ddsFinal,normalized=TRUE)[,c(13:15)]) Max=pmax(res1$MeanCtrl,res1$MeanPatientCellLine) ChosenThreshold=100 ind_lowexpressed=which(as.vector(Max)<ChosenThreshold) ### Differential genes ### res_final=res1 padj=0.01 indSigIncrease=order(res1$padj) Sig= which(res1$padj[indSigIncrease] <= padj) indSig=indSigIncrease[Sig] resSig=res1[indSig,] dim(resSig) #695 #most strongly down regulated genes ind_down=which(resSig$log2FoldChange<=0) resSigDown=resSig[ind_down,] dim(resSigDown) #252 MaxDown=pmax(resSigDown$MeanCtrl,resSigDown$MeanPatientCellLine) ind_down_below100=which(as.vector(MaxDown)<ChosenThreshold) length(ind_down_below100) #79 #most strongly up regulated genes ind_up=which(resSig$log2FoldChange>=0) resSigUp=resSig[ind_up,] dim(resSigUp) #443 MaxUp=pmax(resSigUp$MeanCtrl,resSigUp$MeanPatientCellLine) ind_up_below100=which(as.vector(MaxUp)<ChosenThreshold) length(ind_up_below100) #150And the edgeR code for the first analysis:
library("edgeR") library("gplots") rawdata <- read.delim("PatientCellLine_vs_3Ctrl_DESeq2_RawTableFull",header=TRUE, stringsAsFactors=FALSE,row.names="Gene" ) Status=c("PatientCellLine", "PatientCellLine","PatientCellLine", "PatientCellLine", "PatientCellLine","PatientCellLine", "PatientCellLine", "PatientCellLine", "PatientCellLine","PatientCellLine", "PatientCellLine", "PatientCellLine", "Ctrl", "Ctrl", "Ctrl") ################# # design matrix # ################# design <- model.matrix(~Status) rownames(design) <- colnames(rawdata) y <- DGEList(counts=rawdata, group=Status) keep <- rowSums(cpm(y)>1) >= 1 y <- y[keep,] dim(y) #17904 ################################################### # Differential expression & Dispersion estimation # ################################################### y <- calcNormFactors(y) scale <- y$samples$lib.size* y$samples$norm.factors normCounts <- round(t(t(y$counts)/scale)*mean(scale)) #Normalized counts y$normCounts=normCounts y <- estimateGLMRobustDisp(y,design) fit <- glmFit(y,design) lrt <- glmLRT(fit,coef=ncol(fit$design)) summary(de <- decideTestsDGE(lrt,p.value=0.01,lfc=0)) detags <- rownames(y)[as.logical(de)] table= topTags(lrt,n=nrow(y),p.value=1,sort.by="none")$table table$baseMean=rowMeans(normCounts) table$MeanPatientCellLine=rowMeans(normCounts[,1:12]) table$MeanCtrl=rowMeans(normCounts[,13:15]) padj=0.01 ChosenThreshold=100 indSigIncrease=order(table$FDR) Sig= which(table$FDR[indSigIncrease] <= padj) indSig=indSigIncrease[Sig] resSig=table[indSig,] dim(resSig) #702 #most strongly down regulated genes ind_down=which(resSig$logFC<=0) resSigDown=resSig[ind_down,] dim(resSigDown) #445 MaxDown=pmax(resSigDown$MeanPatientCellLine,resSigDown$MeanCtrl) ind_down_below100=which(as.vector(MaxDown)<ChosenThreshold) length(ind_down_below100) #95 #most strongly up regulated genes ind_up=which(resSig$logFC>=0) resSigUp=resSig[ind_up,] dim(resSigUp) #257 MaxUp=pmax(resSigUp$MeanPatientCellLine,resSigUp$MeanCtrl) ind_up_below100=which(as.vector(MaxUp)<ChosenThreshold) length(ind_up_below100) #54Here is the DESeq2 code for analysis 2:
library("DESeq2") ##################################### ### Creating CountDataSet object ### ##################################### Nom=c("PatientCellLine_ARN_COM","PatientCellLine_ARN_GAN","PatientCellLine_ARN_GEN","PatientCellLine_ARN_LEV","PatientCellLine_ARN_PAR","PatientCellLine_ARN_SAU","PatientCellLine_ARN_GRA","PatientCellLine_ARN_KOR","PatientCellLine_ARN_POP","PatientCellLine_ARN_RIP", "PatientCellLine_ARN_SEG","PatientCellLine_ARN_SEV") MUT=c("Mut1", "Mut1","Mut1", "Mut1", "Mut1","Mut1", "Mut2", "Mut2","Mut2", "Mut2","Mut2", "Mut2") Fichier = paste(Nom,"gencod.htseqCount",sep=".") DataFrame=data.frame(Nom,Fichier,MUT) ################################################ ### Modèle ### ### Estimation of sizeFactors and Dispersion ### ################################################ dds_raw=DESeqDataSetFromHTSeqCount(DataFrame,"/home/PatientCellLine_ARN/Results/Data/17052016", ~MUT) colData(dds_raw)$MUT = factor(colData(dds_raw)$MUT, levels=c("Mut2", "Mut1")) colData(dds_raw)$MUT = relevel(colData(dds_raw)$MUT, "Mut2") write.table(counts(dds_raw),file="PatientCellLine_Mut_DESeq2_RawTableFull",quote=FALSE,row.names=TRUE, col.names=TRUE,sep="\t") sum( rowSums(counts(dds_raw)) ==0) #23622 dds <- dds_raw[ rowSums(counts(dds_raw)) > 1, ] #36179 dds=DESeq(dds, minReplicatesForReplace=6) ##################################### ### Differential expression Tests ### ##################################### ddsFinal=dds resultsNames(ddsFinal) "Intercept" "Mut2" "Mut1" res1=results(ddsFinal,contrast=c(0,1,-1),pAdjustMethod="BH", cooksCutoff=TRUE, independentFiltering=TRUE,alpha=0.01) sum(res1$padj < 0.01, na.rm=TRUE) resNoFilt=results(ddsFinal,contrast=c(0,-1,1),pAdjustMethod="BH", cooksCutoff=TRUE, independentFiltering=FALSE,alpha=0.01) table(filtering=(res1$padj<0.01), noFiltering=(resNoFilt$padj<0.01)) res1$FoldChange=2^(res1$log2FoldChange) res1$MeanMut1 <- rowMeans(counts(ddsFinal,normalized=TRUE)[,1:6]) res1$MeanMut2 <- rowMeans(counts(ddsFinal,normalized=TRUE)[,6:12]) Max=pmax(res1$MeanMut1,res1$MeanMut2) ChosenThreshold=100 ### Differential genes ### res_final=res1 padj=0.01 indSigIncrease=order(res1$padj) Sig= which(res1$padj[indSigIncrease] <= padj) indSig=indSigIncrease[Sig] resSig=res1[indSig,] dim(resSig) #145 #most strongly down regulated genes ind_down=which(resSig$log2FoldChange<=0) resSigDown=resSig[ind_down,] dim(resSigDown) #65 MaxDown=pmax(resSigDown$MeanMut1,resSigDown$MeanMut2) ind_down_below100=which(as.vector(MaxDown)<ChosenThreshold) length(ind_down_below100) #21 #most strongly up regulated genes ind_up=which(resSig$log2FoldChange>=0) resSigUp=resSig[ind_up,] dim(resSigUp) #80 MaxUp=pmax(resSigUp$MeanMut1,resSigUp$MeanMut2) ind_up_below100=which(as.vector(MaxUp)<ChosenThreshold) length(ind_up_below100) #22Sorry for the length...
Finally, the edgeR code for analysis 2:
library("edgeR") rawdata <- read.delim("PatientCellLine_MUT_DESeq2_RawTableFull",row.names="Gene",header=TRUE, stringsAsFactors=FALSE) MUT= factor(c("Mut1", "Mut1","Mut1","Mut1", "Mut1","Mut1", "Mut2", "Mut2", "Mut2", "Mut2", "Mut2", "Mut2")) ################# # design matrix # ################# design <- model.matrix(~MUT) rownames(design) <- colnames(rawdata) y <- DGEList(counts=rawdata, group=MUT) keep <- rowSums(cpm(y)>1) >= 1 y <- y[keep,] dim(y) # 17556 ########################## # Differential expression & Dispersion estimation ########################## y <- calcNormFactors(y) scale <- y$samples$lib.size* y$samples$norm.factors normCounts <- round(t(t(y$counts)/scale)*mean(scale)) y$normCounts=normCounts y <- estimateGLMRobustDisp(y,design) fit <- glmFit(y,design) lrt <- glmLRT(fit,coef=2) summary(de <- decideTestsDGE(lrt,p.value=0.01, lfc=0)) detags <- rownames(y)[as.logical(de)] table= topTags(lrt,n=nrow(y),p.value=1,sort.by="none")$table table$baseMean=rowMeans(normCounts) table$MeanMut1=rowMeans(normCounts[,1:6]) table$MeanMut2=rowMeans(normCounts[,7:12]) padj=0.01 ChosenThreshold=100 indSigIncrease=order(table$FDR) Sig= which(table$FDR[indSigIncrease] <= padj) indSig=indSigIncrease[Sig] resSig=table[indSig,] dim(resSig) #210 #most strongly down regulated genes ind_down=which(resSig$logFC<=0) resSigDown=resSig[ind_down,] dim(resSigDown) #73 MaxDown=pmax(resSigDown$MeanMut1,resSigDown$MeanMut2) ind_down_below100=which(as.vector(MaxDown)<ChosenThreshold) length(ind_down_below100) #19 #most strongly up regulated genes ind_up=which(resSig$logFC>=0) resSigUp=resSig[ind_up,] dim(resSigUp) #137 MaxUp=pmax(resSigUp$MeanMut1,resSigUp$MeanMut2) ind_up_below100=which(as.vector(MaxUp)<ChosenThreshold) length(ind_up_below100) #20