Dear All,
I am doing an analysis on human primary cell samples +/- treatment. Due shortage of cells we constructed the library from pg RNA as starting material plus the sequencing returned very low counts for most of genes.
I extracted the protein coding genes from the matrix . However, the default method for size factor estimation returns NULL. At the begging I thought it is because my median counts is 0 so geom mean would not work, but after I filtered out genes (more than 7000) that had counts across all samples it still gives NULL.
I tried other methods suggested and iterate works well; I know the " "iterate" estimator iterates between estimating the dispersion with a design of ~1, and finding a size factor vector by numerically optimizing the likelihood of the ~1 model" , but I do not think that I clearly understand what exactly it does to my data.
In addition I get very high dispersion estimates.
> dds<- DESeqDataSetFromTximport(txi_simplesum, sampleTable, ~ 1)
using counts and average transcript lengths from tximport
>
> dds.protein.coding <- dds[rownames(dds) %in% Sample_coding.v]
> dds.protein.coding
class: DESeqDataSet
dim: 18762 94
metadata(1): version
assays(2): counts avgTxLength
rownames(18762): ENSG00000000419 ENSG00000000457 ... ENSG00000280267 ENSG00000280433
rowData names(0):
colnames(94): 0406_S18_S48_L008 0406_S19_S26_L005 ... 2105S2_S30 3007S6_S29
colData names(18): Sampl.name RUN ... SAMPLE.TYPE Condition
> Blood.CD19.coding <- dds.protein.coding[ ,dds.protein.coding$Cell.type %in% "BLOOD.CD19"]
> Blood.CD19.coding
class: DESeqDataSet
dim: 18762 26
metadata(1): version
assays(2): counts avgTxLength
rownames(18762): ENSG00000000419 ENSG00000000457 ... ENSG00000280267 ENSG00000280433
rowData names(0):
colnames(26): 07_02_2017_6D_S1_L001 1405_S6_S36_L006 ... 3004_S6_S17_L004 3007S6_S29
colData names(18): Sampl.name RUN ... SAMPLE.TYPE Condition
> design(Blood.CD19.coding) <- ~ Condition
> GeneCounts.Blood.CD19 <- counts(Blood.CD19.coding)
> idx.nz.Blood.CD19 <- apply(GeneCounts.Blood.CD19, 1, function(x) { all(x >0)})
> sum(idx.nz.Blood.CD19)
[1] 7749
> Blood.CD19.ratio <- estimateSizeFactors(Blood.CD19.coding)
using 'avgTxLength' from assays(dds), correcting for library size
> sizeFactors(Blood.CD19.ratio)
NULL
> Blood.CD19.iterate <- estimateSizeFactors(Blood.CD19.coding, type = ("iterate"))
> sizeFactors(Blood.CD19.iterate)
07_02_2017_6D_S1_L001 1405_S6_S36_L006 1412_S6_S17_L003 1606_S6_S21_L004
1.7455888 0.5103565 1.2959369 0.7646775
2001_S6_S29_L005 2409_S6_S47_L008 27_10_2016_6A_S30_L005 0406_S6_S30_L006
0.7822790 0.7170913 1.2903509 0.5022084
0508_S6_S43_L008 0511_S6_S9_L002 07_02_2017_6F_S10_L002 0705_S6_S18_L004
0.4575517 1.5178593 1.8893295 0.6809406
0810_S6_S12_L003 09_02_2017_6D_S29_L006 09_02_2017_6L_S42_L008 1410_S6_S1_L001
0.7387591 1.9627270 1.4965968 0.8661562
1609_S6_S16_L003 17_02_2017_6_S23_L005 1707_S6_S22_L004 19_10_2016_6B_S39_L007
0.7735276 1.5968769 0.5657576 2.2309920
2105_S6_S24_L005 23_02_2017_6C_S11_L003 23_02_2017_6F_S15_L003 27_10_2016_6B_S34_L006
0.8620617 1.3122498 0.9754176 1.7786902
3004_S6_S17_L004 3007S6_S29
0.4959165 1.2226773
> idx.Blood.CD19 <- rowSums(counts(Blood.CD19.iterate, normalize = TRUE) >= 10) >=6
> Blood.CD19.keep <- Blood.CD19.coding[idx.Blood.CD19, ]
> nrow(Blood.CD19.keep)
[1] 12804
> Blood.CD19.keep <- estimateSizeFactors(Blood.CD19.keep)
using 'avgTxLength' from assays(dds), correcting for library size
> sizeFactors(Blood.CD19.keep)
NULL
> idx.Blood.CD19.all.rows <- rowSums(counts(Blood.CD19.iterate, normalize = TRUE) >= 10) >=26
> Blood.CD19.all.rows <- Blood.CD19.coding[idx.Blood.CD19.all.rows , ]
> nrow(Blood.CD19.all.rows)
[1] 6416
> Blood.CD19.all.rows <- estimateSizeFactors(Blood.CD19.all.rows)
using 'avgTxLength' from assays(dds), correcting for library size
> sizeFactors(Blood.CD19.all.rows)
NULL
> sessionInfo()
R version 3.4.0 (2017-04-21)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
Matrix products: default
locale:
[1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252
[3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C
[5] LC_TIME=English_United Kingdom.1252
attached base packages:
[1] grid parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] sva_3.24.4 BiocParallel_1.10.1 mgcv_1.8-22
[4] nlme_3.1-131 BiocInstaller_1.26.1 biomaRt_2.32.1
[7] Rgraphviz_2.20.0 plyr_1.8.4 topGO_2.28.0
[10] SparseM_1.77 GO.db_3.4.1 graph_1.54.0
[13] genefilter_1.58.1 fdrtool_1.2.15 org.Hs.eg.db_3.4.1
[16] pheatmap_1.0.8 hexbin_1.27.1 PoiClaClu_1.0.2
[19] factoextra_1.0.5 RColorBrewer_1.1-2 geneplotter_1.54.0
[22] annotate_1.54.0 XML_3.98-1.9 AnnotationDbi_1.38.2
[25] Hmisc_4.0-3 ggplot2_2.2.1 Formula_1.2-2
[28] survival_2.41-3 lattice_0.20-35 DESeq2_1.16.1
[31] SummarizedExperiment_1.6.5 DelayedArray_0.2.7 matrixStats_0.52.2
[34] Biobase_2.36.2 GenomicRanges_1.28.6 GenomeInfoDb_1.12.3
[37] IRanges_2.10.5 S4Vectors_0.14.7 BiocGenerics_0.22.1
[40] rjson_0.2.15 readr_1.1.1 tximport_1.4.0
loaded via a namespace (and not attached):
[1] bitops_1.0-6 bit64_0.9-7 tools_3.4.0 backports_1.1.1
[5] R6_2.2.2 rpart_4.1-11 DBI_0.7 lazyeval_0.2.1
[9] colorspace_1.3-2 nnet_7.3-12 gridExtra_2.3 bit_1.1-12
[13] compiler_3.4.0 htmlTable_1.9 scales_0.5.0 checkmate_1.8.5
[17] stringr_1.2.0 digest_0.6.12 foreign_0.8-69 XVector_0.16.0
[21] base64enc_0.1-3 pkgconfig_2.0.1 htmltools_0.3.6 limma_3.32.10
[25] htmlwidgets_0.9 rlang_0.1.4 RSQLite_2.0 bindr_0.1
[29] acepack_1.4.1 RCurl_1.95-4.8 magrittr_1.5 GenomeInfoDbData_0.99.0
[33] Matrix_1.2-12 Rcpp_0.12.13 munsell_0.4.3 stringi_1.1.6
[37] zlibbioc_1.22.0 blob_1.1.0 ggrepel_0.7.0 splines_3.4.0
[41] hms_0.3 locfit_1.5-9.1 knitr_1.17 latticeExtra_0.6-28
[45] data.table_1.10.4-3 gtable_0.2.0 assertthat_0.2.0 xtable_1.8-2
[49] tibble_1.3.4 memoise_1.1.0 bindrcpp_0.2 cluster_2.0.6