Hi all,
I have a dataset and people who worked with that dataset used Deseq and now I want to use Deseq2 and compare the results. However I am a bit confused. I have RNA seq and ribosome profiling data from same tissue. How I can use Deseq2 for ribosome profiling data. Do I need to add some normalisation steps to combine RNA-seq and ribosome profiling data?
> exprTable <- data.frame(L061ribo=L061_ribo$V3, L062ribo=L062_ribo$V3, L063ribo=L063_ribo$V3, + L241ribo=L241_ribo$V3, L242ribo=L242_ribo$V3, L243ribo=L243_ribo$V3, + L061mrna=L061_mRNA$V3, L062mrna=L062_mRNA$V3, L063mrna=L063_mRNA$V3, + L241mrna=L241_mRNA$V3, L242mrna=L242_mRNA$V3, L243mrna=L243_mRNA$V3, + row.names=L061_mRNA$V1) > > allSamples = c("L061mrna","L061ribo", "L062mrna", "L062ribo", "L063mrna", + "L063ribo", "L241mrna", "L241ribo", "L242mrna", "L242ribo", + "L243mrna", "L243ribo") > > mrnaSamples = c("L061mrna", "L062mrna", "L063mrna", "L241mrna", "L242mrna", "L243mrna") > > conditions<-data.frame( + biol =factor(c("mrna", "ribo", "mrna", "ribo","mrna", "ribo", "mrna", "ribo", "mrna", "ribo","mrna", "ribo")), + age =factor(c("06", "06","06", "24", "24", "24","06", "06","06", "24", "24", "24")), + row.names=allSamples + ) > > > #1. countData # read counts for all the samples > #2. SampleTable # sample names and their condition status > exprAll<-exprTable[,allSamples] > means<-rowMeans(exprAll[,mrnaSamples]) > countData<-exprAll[means>=100,] > head(countData) L061mrna L061ribo L062mrna L062ribo L063mrna L063ribo L241mrna L241ribo L242mrna L242ribo L243mrna ENSRNOT00000045657 910 228 904 340 890 422 1005 208 918 220 1166 ENSRNOT00000061299 920 757 1635 1503 1501 1956 1392 719 1273 1036 1620 ENSRNOT00000057034 1436 1234 1392 1400 1485 1753 1895 986 1855 1309 2335 ENSRNOT00000047071 189 327 169 510 172 446 266 263 214 301 535 ENSRNOT00000005653 325 296 332 443 322 503 414 309 341 307 412 ENSRNOT00000025255 478 236 380 239 349 370 589 280 509 383 580 L243ribo ENSRNOT00000045657 344 ENSRNOT00000061299 1174 ENSRNOT00000057034 1803 ENSRNOT00000047071 569 ENSRNOT00000005653 448 ENSRNOT00000025255 501 > ``` Error: attempt to use zero-length variable name > > ```{r DeSeq2 analysis} Error: attempt to use zero-length variable name > dds <- DESeqDataSetFromMatrix(countData = countData, colData = conditions, design = ~ biol) > dds <- DESeq(dds) estimating size factors estimating dispersions gene-wise dispersion estimates mean-dispersion relationship final dispersion estimates fitting model and testing > result_age <- results(dds) ## contrast specifies conditions to be tested > res<- result_age[complete.cases(result_age),] ## to remove rows with NA > mcols(res, use.names=TRUE) DataFrame with 6 rows and 2 columns type description <character> <character> baseMean intermediate mean of normalized counts for all samples log2FoldChange results log2 fold change (MAP): biol ribo vs mrna lfcSE results standard error: biol ribo vs mrna stat results Wald statistic: biol ribo vs mrna pvalue results Wald test p-value: biol ribo vs mrna padj results BH adjusted p-values > summary(res) out of 8908 with nonzero total read count adjusted p-value < 0.1 LFC > 0 (up) : 3391, 38% LFC < 0 (down) : 3330, 37% outliers [1] : 0, 0% low counts [2] : 0, 0% (mean count < 35) [1] see 'cooksCutoff' argument of ?results [2] see 'independentFiltering' argument of ?results > #################################################################################### > #lower the false discovery rate threshold (the threshold on padj in the results table) > #raise the log2 fold change threshold from 0 using the lfcThreshold argument of results > res.05 <- results(dds, alpha=.05) > table(res.05$padj < .05) FALSE TRUE 2526 6382 > > # genes that show significant effects of treatment on gene counts more than doubling or less than halving > resLFC1 <- results(dds, lfcThreshold=1) > table(resLFC1$padj < 0.1) FALSE TRUE 8003 905 > > #Multiple testing > sum(res$pvalue < 0.05, na.rm=TRUE) [1] 6534 > sum(!is.na(res$pvalue)) [1] 8908 > > sum(res$padj < 0.1, na.rm=TRUE) [1] 6721 > > #subset the results table to these genes and then sort it by the log2 fold change estimate to get the significant genes with the strongest down-regulation: > resSig <- subset(res, padj < 0.1) > head(resSig[ order(resSig$log2FoldChange), ]) log2 fold change (MAP): biol ribo vs mrna Wald test p-value: biol ribo vs mrna DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue padj <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> ENSRNOT00000022122 375.8005 -8.617070 0.4611512 -18.68600 6.436621e-78 9.885762e-76 ENSRNOT00000041959 221.1546 -8.029477 0.4685828 -17.13566 8.043061e-66 7.541851e-64 ENSRNOT00000048151 202.3702 -7.880453 0.4742099 -16.61807 5.156761e-62 4.029511e-60 ENSRNOT00000008094 389.2988 -7.872536 0.3874021 -20.32136 8.324137e-92 2.180924e-89 ENSRNOT00000037432 270.4202 -7.456887 0.3836682 -19.43577 3.846162e-84 7.704694e-82 ENSRNOT00000050722 148.3404 -7.266285 0.5013328 -14.49393 1.323404e-47 5.953983e-46 > > #significant genes with the strongest up-regulation: > head(resSig[ order(resSig$log2FoldChange, decreasing=TRUE), ]) log2 fold change (MAP): biol ribo vs mrna Wald test p-value: biol ribo vs mrna DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue padj <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> ENSRNOT00000050750 1376.2906 4.755635 0.18104828 26.26722 4.544604e-152 5.783333e-149 ENSRNOT00000034797 1358.0869 4.671882 0.22111888 21.12837 4.363183e-99 1.439527e-96 ENSRNOT00000065232 705.1629 4.053485 0.26129942 15.51280 2.842252e-54 1.622999e-52 ENSRNOT00000056004 1128.9684 3.453405 0.09453835 36.52914 3.823170e-292 3.405679e-288 ENSRNOT00000063903 1383.1973 3.443192 0.11967516 28.77115 4.925776e-182 7.313135e-179 ENSRNOT00000058371 1128.5519 3.424241 0.10957817 31.24930 2.281797e-214 5.081561e-211