Hi all,

I've come across some unexpected results when trying to go over-representation testing on array methylation data, which I've traced to the underlying `goana() `

and `kegga() `

functions from limma. Genes can have different numbers of CpGs, and so the `gometh() `

function from missMethyl will by default calculate the probability of a gene being called significant based on the number of CpGs for that gene and then pass this as the `prior.prob`

argument for `goana `

or `kegga`

. For GO testing, adjusting for this prior.prob decreases the number of significant GO terms as I would have expected. However, for KEGG pathways, adjusting for the prior.prob greatly increases the number of significant pathways. Even randomly selected sets of CpGs have over half of the 319 KEGG pathways with raw p < 0.001 when adjusting for prior.prob, but only a dozen pathways when not adjusted. How is this possible? Reproducible example below.

Thanks,

Jenny

# codes for creating the necessary objects from the missMethyl vignette: ## ----load-libs, message=FALSE-------------------------------------------- library(missMethyl) library(limma) library(minfi) ## ----reading-data, message=FALSE----------------------------------------- library(minfiData) baseDir <- system.file("extdata", package = "minfiData") targets <- read.metharray.sheet(baseDir) rgSet <- read.metharray.exp(targets = targets) ## ----ppraw--------------------------------------------------------------- mSet <- preprocessRaw(rgSet) ## ----diffmeth2----------------------------------------------------------- # get M-values for ALL probes meth <- getMeth(mSet) unmeth <- getUnmeth(mSet) M <- log2((meth + 100)/(unmeth + 100)) grp <- factor(targets$status,levels=c("normal","cancer")) des <- model.matrix(~grp) INCs <- getINCs(rgSet) Mc <- rbind(M,INCs) ctl <- rownames(Mc) %in% rownames(INCs) rfit1 <- RUVfit(data=Mc, design=des, coef=2, ctl=ctl) # Stage 1 analysis rfit2 <- RUVadj(rfit1) ## ----ruv1---------------------------------------------------------------- top1 <- topRUV(rfit2, num=Inf) ctl <- rownames(M) %in% rownames(top1[top1$p.ebayes.BH > 0.5,]) ## ----ruv2---------------------------------------------------------------- # Perform RUV adjustment and fit rfit1 <- RUVfit(data=M, design=des, coef=2, ctl=ctl) # Stage 2 analysis rfit2 <- RUVadj(rfit1) ## ----gometh3------------------------------------------------------------- topCpGs<-topRUV(rfit2,number=10000) sigCpGs <- rownames(topCpGs) ## ----gometh4------------------------------------------------------------- library(IlluminaHumanMethylation450kanno.ilmn12.hg19) # end missMethyl vignette # check how many GO and KEGG have raw p < 0.001 for the selected CpG set gst <- gometh(sig.cpg=sigCpGs, all.cpg=rownames(rfit2), collection="GO") sum(gst$P.DE < 0.001) # [1] 628 - with bias correction gst.nobias <- gometh(sig.cpg=sigCpGs, all.cpg=rownames(rfit2), collection="GO", prior.prob = FALSE) sum(gst.nobias$P.DE < 0.001) #[1] 1126 - without bias correction kst <- gometh(sig.cpg=sigCpGs, all.cpg=rownames(rfit2), collection="KEGG") sum(kst$P.DE < 0.001) # [1] 160 - with bias correction, out of 319 pathways total kst.nobias <- gometh(sig.cpg=sigCpGs, all.cpg=rownames(rfit2), collection="KEGG", prior.prob = FALSE) sum(kst.nobias$P.DE < 0.001) # [1] 38 - without bias correction # Pull 10 random subset of CpGs of same size rand.001 <- matrix(NA, 10, 4) colnames(rand.001) <- c("gst","gst.nobias","kst","kst.nobias") #This takes a few minutes: for(i in 1:10) { temp <- sample(rownames(rfit2), size = length(sigCpGs)) gst.rand <- gometh(sig.cpg=temp, all.cpg=rownames(rfit2), collection="GO") rand.001[i,1] <- sum(gst.rand$P.DE < 0.001) gst.nobias.rand <- gometh(sig.cpg=temp, all.cpg=rownames(rfit2), collection="GO", prior.prob = FALSE) rand.001[i,2] <- sum(gst.nobias.rand$P.DE < 0.001) kst.rand <- gometh(sig.cpg=temp, all.cpg=rownames(rfit2), collection="KEGG") rand.001[i,3] <- sum(kst.rand$P.DE < 0.001) kst.nobias.rand <- gometh(sig.cpg=temp, all.cpg=rownames(rfit2), collection="KEGG", prior.prob = FALSE) rand.001[i,4] <- sum(kst.nobias.rand$P.DE < 0.001) print(i) } colMeans(rand.001) # gst gst.nobias kst kst.nobias # 24.1 556.6 258.9 12.6 sessionInfo() R version 3.4.0 (2017-04-21) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows >= 8 x64 (build 9200) Matrix products: default locale: [1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 [3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C [5] LC_TIME=English_United States.1252 attached base packages: [1] stats4 parallel stats graphics grDevices utils datasets methods base other attached packages: [1] minfiData_0.22.0 IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.0 [3] IlluminaHumanMethylation450kmanifest_0.4.0 minfi_1.22.1 [5] bumphunter_1.16.0 locfit_1.5-9.1 [7] iterators_1.0.8 foreach_1.4.3 [9] Biostrings_2.44.0 XVector_0.16.0 [11] SummarizedExperiment_1.6.1 DelayedArray_0.2.2 [13] matrixStats_0.52.2 Biobase_2.36.2 [15] GenomicRanges_1.28.1 GenomeInfoDb_1.12.0 [17] IRanges_2.10.1 S4Vectors_0.14.1 [19] BiocGenerics_0.22.0 limma_3.32.2 [21] missMethyl_1.10.0 #the rest deleted because I was over the character limit

Thanks, Gordon! I've downloaded the new version and it's working as expected.

Cheers, Jenny