Hi all,
I have a dataset and people who worked with that dataset used Deseq and now I want to use Deseq2 and compare the results. However I am a bit confused. I have RNA seq and ribosome profiling data from same tissue. How I can use Deseq2 for ribosome profiling data. Do I need to add some normalisation steps to combine RNA-seq and ribosome profiling data?
> exprTable <- data.frame(L061ribo=L061_ribo$V3, L062ribo=L062_ribo$V3, L063ribo=L063_ribo$V3,
+ L241ribo=L241_ribo$V3, L242ribo=L242_ribo$V3, L243ribo=L243_ribo$V3,
+ L061mrna=L061_mRNA$V3, L062mrna=L062_mRNA$V3, L063mrna=L063_mRNA$V3,
+ L241mrna=L241_mRNA$V3, L242mrna=L242_mRNA$V3, L243mrna=L243_mRNA$V3,
+ row.names=L061_mRNA$V1)
>
> allSamples = c("L061mrna","L061ribo", "L062mrna", "L062ribo", "L063mrna",
+ "L063ribo", "L241mrna", "L241ribo", "L242mrna", "L242ribo",
+ "L243mrna", "L243ribo")
>
> mrnaSamples = c("L061mrna", "L062mrna", "L063mrna", "L241mrna", "L242mrna", "L243mrna")
>
> conditions<-data.frame(
+ biol =factor(c("mrna", "ribo", "mrna", "ribo","mrna", "ribo", "mrna", "ribo", "mrna", "ribo","mrna", "ribo")),
+ age =factor(c("06", "06","06", "24", "24", "24","06", "06","06", "24", "24", "24")),
+ row.names=allSamples
+ )
>
>
> #1. countData # read counts for all the samples
> #2. SampleTable # sample names and their condition status
> exprAll<-exprTable[,allSamples]
> means<-rowMeans(exprAll[,mrnaSamples])
> countData<-exprAll[means>=100,]
> head(countData)
L061mrna L061ribo L062mrna L062ribo L063mrna L063ribo L241mrna L241ribo L242mrna L242ribo L243mrna
ENSRNOT00000045657 910 228 904 340 890 422 1005 208 918 220 1166
ENSRNOT00000061299 920 757 1635 1503 1501 1956 1392 719 1273 1036 1620
ENSRNOT00000057034 1436 1234 1392 1400 1485 1753 1895 986 1855 1309 2335
ENSRNOT00000047071 189 327 169 510 172 446 266 263 214 301 535
ENSRNOT00000005653 325 296 332 443 322 503 414 309 341 307 412
ENSRNOT00000025255 478 236 380 239 349 370 589 280 509 383 580
L243ribo
ENSRNOT00000045657 344
ENSRNOT00000061299 1174
ENSRNOT00000057034 1803
ENSRNOT00000047071 569
ENSRNOT00000005653 448
ENSRNOT00000025255 501
> ```
Error: attempt to use zero-length variable name
>
> ```{r DeSeq2 analysis}
Error: attempt to use zero-length variable name
> dds <- DESeqDataSetFromMatrix(countData = countData, colData = conditions, design = ~ biol)
> dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
> result_age <- results(dds) ## contrast specifies conditions to be tested
> res<- result_age[complete.cases(result_age),] ## to remove rows with NA
> mcols(res, use.names=TRUE)
DataFrame with 6 rows and 2 columns
type description
<character> <character>
baseMean intermediate mean of normalized counts for all samples
log2FoldChange results log2 fold change (MAP): biol ribo vs mrna
lfcSE results standard error: biol ribo vs mrna
stat results Wald statistic: biol ribo vs mrna
pvalue results Wald test p-value: biol ribo vs mrna
padj results BH adjusted p-values
> summary(res)
out of 8908 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 3391, 38%
LFC < 0 (down) : 3330, 37%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 35)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
> ####################################################################################
> #lower the false discovery rate threshold (the threshold on padj in the results table)
> #raise the log2 fold change threshold from 0 using the lfcThreshold argument of results
> res.05 <- results(dds, alpha=.05)
> table(res.05$padj < .05)
FALSE TRUE
2526 6382
>
> # genes that show significant effects of treatment on gene counts more than doubling or less than halving
> resLFC1 <- results(dds, lfcThreshold=1)
> table(resLFC1$padj < 0.1)
FALSE TRUE
8003 905
>
> #Multiple testing
> sum(res$pvalue < 0.05, na.rm=TRUE)
[1] 6534
> sum(!is.na(res$pvalue))
[1] 8908
>
> sum(res$padj < 0.1, na.rm=TRUE)
[1] 6721
>
> #subset the results table to these genes and then sort it by the log2 fold change estimate to get the significant genes with the strongest down-regulation:
> resSig <- subset(res, padj < 0.1)
> head(resSig[ order(resSig$log2FoldChange), ])
log2 fold change (MAP): biol ribo vs mrna
Wald test p-value: biol ribo vs mrna
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSRNOT00000022122 375.8005 -8.617070 0.4611512 -18.68600 6.436621e-78 9.885762e-76
ENSRNOT00000041959 221.1546 -8.029477 0.4685828 -17.13566 8.043061e-66 7.541851e-64
ENSRNOT00000048151 202.3702 -7.880453 0.4742099 -16.61807 5.156761e-62 4.029511e-60
ENSRNOT00000008094 389.2988 -7.872536 0.3874021 -20.32136 8.324137e-92 2.180924e-89
ENSRNOT00000037432 270.4202 -7.456887 0.3836682 -19.43577 3.846162e-84 7.704694e-82
ENSRNOT00000050722 148.3404 -7.266285 0.5013328 -14.49393 1.323404e-47 5.953983e-46
>
> #significant genes with the strongest up-regulation:
> head(resSig[ order(resSig$log2FoldChange, decreasing=TRUE), ])
log2 fold change (MAP): biol ribo vs mrna
Wald test p-value: biol ribo vs mrna
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSRNOT00000050750 1376.2906 4.755635 0.18104828 26.26722 4.544604e-152 5.783333e-149
ENSRNOT00000034797 1358.0869 4.671882 0.22111888 21.12837 4.363183e-99 1.439527e-96
ENSRNOT00000065232 705.1629 4.053485 0.26129942 15.51280 2.842252e-54 1.622999e-52
ENSRNOT00000056004 1128.9684 3.453405 0.09453835 36.52914 3.823170e-292 3.405679e-288
ENSRNOT00000063903 1383.1973 3.443192 0.11967516 28.77115 4.925776e-182 7.313135e-179
ENSRNOT00000058371 1128.5519 3.424241 0.10957817 31.24930 2.281797e-214 5.081561e-211
