Hi all,
I think there is a bug in the kegga()
function when passing the prior.prob/trend argument. I came across this before when starting from the gometh()
function with some methylation data and posted it to the support site here (prior.prob has opposite effects in goana and kegga?) but didn't get any answers. I've come across this again with some RNA-Seq data that I was trying to adjust for the DE bias caused by increased average expression level. Running goana()
on a DGELRT object with trend = TRUE
instead of FALSE led to fewer GO terms at a particular significance level as I would have expected, but running kegga()
on the same DGELRT object with trend = TRUE
instead of FALSE led to many more KEGG terms at a particular significance level. I have a reproducible example of this below, and in my own data set I verified that a pathway with genes with higher expression levels had it's p-value decrease with trend = TRUE, which is opposite of what it should do if kegga() is removing the bias towards high expression genes. Is this a bug?
Thanks,
Jenny
#Reproducible code taken from edgeR workflow codes: http://www.bioconductor.org/help/workflows/RnaSeqGeneEdgeRQL/RnaSeqGeneEdgeRQL.R > ## ----sra----------------------------------------------------------------- > library(RnaSeqGeneEdgeRQL) Loading required package: edgeR Loading required package: limma Loading required package: gplots Attaching package: ‘gplots’ The following object is masked from ‘package:stats’: lowess Loading required package: org.Mm.eg.db Loading required package: AnnotationDbi Loading required package: stats4 Loading required package: BiocGenerics Loading required package: parallel Attaching package: ‘BiocGenerics’ The following objects are masked from ‘package:parallel’: clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB The following object is masked from ‘package:limma’: plotMA The following objects are masked from ‘package:stats’: IQR, mad, sd, var, xtabs The following objects are masked from ‘package:base’: anyDuplicated, append, as.data.frame, cbind, colMeans, colnames, colSums, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min Loading required package: Biobase Welcome to Bioconductor Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'. Loading required package: IRanges Loading required package: S4Vectors Attaching package: ‘S4Vectors’ The following object is masked from ‘package:gplots’: space The following object is masked from ‘package:base’: expand.grid Loading required package: GO.db > targetsFile <- system.file("extdata", "targets.txt", package="RnaSeqGeneEdgeRQL") > targets <- read.delim(targetsFile, stringsAsFactors=FALSE) > > ## ----group--------------------------------------------------------------- > group <- paste(targets$CellType, targets$Status, sep=".") > group <- factor(group) > > ## ----checkdownload, echo=FALSE, results="hide", message=FALSE------------ > if( !file.exists("GSE60450_Lactation-GenewiseCounts.txt.gz") ) { + FileURL <- paste( + "http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE60450", + "format=file", + "file=GSE60450_Lactation-GenewiseCounts.txt.gz", + sep="&") + download.file(FileURL, method="libcurl", "GSE60450_Lactation-GenewiseCounts.txt.gz") + } > > ## ----download, eval=FALSE------------------------------------------------ > ## FileURL <- paste( > ## "http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE60450", > ## "format=file", > ## "file=GSE60450_Lactation-GenewiseCounts.txt.gz", > ## sep="&") > ## download.file(FileURL, "GSE60450_Lactation-GenewiseCounts.txt.gz") > > ## ----readcounts---------------------------------------------------------- > GenewiseCounts <- read.delim("GSE60450_Lactation-GenewiseCounts.txt.gz", + row.names="EntrezGeneID") > colnames(GenewiseCounts) <- substring(colnames(GenewiseCounts),1,7) > > ## ----DGEList, message=FALSE---------------------------------------------- > library(edgeR) > y <- DGEList(GenewiseCounts[,-1], group=group, + genes=GenewiseCounts[,1,drop=FALSE]) > options(digits=3) > > ## ----symbols, message=FALSE---------------------------------------------- > library(org.Mm.eg.db) > y$genes$Symbol <- mapIds(org.Mm.eg.db, rownames(y), + keytype="ENTREZID", column="SYMBOL") 'select()' returned 1:1 mapping between keys and columns > > ## ----dropNAsymbols------------------------------------------------------- > y <- y[!is.na(y$genes$Symbol), ] > > ## ----keep---------------------------------------------------------------- > keep <- rowSums(cpm(y) > 0.5) >= 2 > > ## ----filter-------------------------------------------------------------- > y <- y[keep, , keep.lib.sizes=FALSE] > > ## ----norm---------------------------------------------------------------- > y <- calcNormFactors(y) > > > ## ----design-------------------------------------------------------------- > design <- model.matrix(~0+group) > colnames(design) <- levels(group) > > ## ----estimateDisp-------------------------------------------------------- > y <- estimateDisp(y, design, robust=TRUE) > > ## ----glmQLFit------------------------------------------------------------ > fit <- glmQLFit(y, design, robust=TRUE) > > ## ----B.PvsL-------------------------------------------------------------- > B.LvsP <- makeContrasts(B.lactating-B.pregnant, levels=design) > > ## ----treat--------------------------------------------------------------- > tr <- glmTreat(fit, contrast=B.LvsP, lfc=log2(1.5)) > > ## ----goana--------------------------------------------------------------- > go.F <- goana(tr, species="Mm", trend = FALSE) #the default for trend > go.T <- goana(tr, species="Mm", trend = TRUE, plot = TRUE) #plot not attached, but low expression has less enrichment > > sum(go.F$P.Down < 1e-3) [1] 310 > sum(go.T$P.Down < 1e-3) [1] 287 > #decrease in number of GO terms when adjusting for expression level trend > > ## ----kegga--------------------------------------------------------------- > keg.F <- kegga(tr, species="Mm", trend = FALSE) #the default for trend > keg.T <- kegga(tr, species="Mm", trend = TRUE, plot = TRUE) > > sum(keg.F$P.Down < 1e-3) [1] 5 > sum(keg.T$P.Down < 1e-3) [1] 65 > #big increase in number of KEGG pathways when adjusting for same expression level trend > > ## ----info---------------------------------------------------------------- > sessionInfo() R version 3.4.0 (2017-04-21) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows >= 8 x64 (build 9200) Matrix products: default locale: [1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 [3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C [5] LC_TIME=English_United States.1252 attached base packages: [1] parallel stats4 stats graphics grDevices utils datasets methods base other attached packages: [1] RnaSeqGeneEdgeRQL_1.0.0 GO.db_3.4.1 org.Mm.eg.db_3.4.1 AnnotationDbi_1.38.1 [5] IRanges_2.10.2 S4Vectors_0.14.3 Biobase_2.36.2 BiocGenerics_0.22.0 [9] gplots_3.0.1 edgeR_3.18.1 limma_3.32.2 loaded via a namespace (and not attached): [1] Rcpp_0.12.11 splines_3.4.0 statmod_1.4.30 bit_1.1-12 lattice_0.20-35 [6] BiasedUrn_1.07 rlang_0.1.1 blob_1.1.0 caTools_1.17.1 tools_3.4.0 [11] grid_3.4.0 KernSmooth_2.23-15 DBI_0.7 gtools_3.5.0 digest_0.6.12 [16] bit64_0.9-7 tibble_1.3.3 bitops_1.0-6 memoise_1.1.0 RSQLite_2.0 [21] gdata_2.18.0 compiler_3.4.0 locfit_1.5-9.1 pkgconfig_2.0.1