Hi all,
I have an experiment with 28 human samples spread over 2 experimental batches. One variable of interest, trt, and 6 other possible variables: expt, sex, age, race, smoker_yrs and cigs_week. The question is how to pick which variables to include in the model when different genes may show different effects of the variables? Instead of trying to pick one best model for all genes, I had the idea to first divide the genes into modules using WGCNA, and then use the module eigengenes in a BIC stepwise selection to arrive at a best model for each module. Only some of the modules had the variable of interest, trt, included in the best model, and for these genes I want to get fold-change values for trt. My approach was to run separate limma/voom analyses on each set of genes in the module using the best model but I'm not sure if this is a good idea to estimate the voom weights and the squeezed variance estimates using in some cases only a few dozen genes. Would it be better to run all genes with the model trt + sex, which was the best model for modules 12 and 16, pull out the FC values just for the genes in 12 and 16, then re-run all genes with model expt + trt + race, which was the best model for module 13, pull out the FC for just genes in module 13, and so on? Example codes below.
Thanks in advance for any advice,
Jenny
# counts with TMM norm factors in object d.filt; variables in object targets v <- voom(d.filt) datExpr <- t(v$E) powers = c(c(1:10), seq(from = 12, to=30, by=2)) sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5, networkType = "signed hybrid") source("C:/RNASeq/plotSFT.R") # I just put the plotting code from WGCNA's tutorial I.2a in a function x11(width = 9, height = 5) plotSFT(sft, lheight = 0.8) #decide to use power = 5; run WGCNA with deepSplit = 4 and no merging to get the most modules net.ds4 <- blockwiseModules(datExpr, power = 5, maxBlockSize = 25000, #should be larger than num. of genes networkType = "signed hybrid", TOMType = "signed", loadTOM = FALSE, saveTOMs = TRUE, saveTOMFileBase = "TOM_BIC_ds4", corType = "pearson", deepSplit = 4, minModuleSize = 20, mergeCutHeight = 0, numericLabels = T, verbose = 3 ) # Get MEs (rows) by samples (columns), in ME order MEs0.ds4.all = t(moduleEigengenes(datExpr, net.ds4$colors)$eigengenes) colnames(MEs0.ds4.all) <- rownames(datExpr) # For each modules MEs, run BIC stepwise regression with all 7 possible variables best.mod.ds4.all <- list() for (i in 1:nrow(MEs0.ds4.all)) { x <- data.frame(MEs = MEs0.ds4.all[i,], age = targets$age, smoker_yrs = targets$smoker_yrs, cigs_week = targets$cigs_week, trt = factor(targets$trt), sex = factor(targets$sex), expt = factor(targets$expt), race = factor(targets$race == 2)) attach(x) n <- nrow(x) temp.b1 <- step(lm(MEs ~ 1), MEs ~ age + smoker_yrs + cigs_week + trt + sex+ expt + race, k = log(n), data = x) # k = log(n) turns AIC into BIC best.mod.ds4.all[[i]] <- temp.b1 detach(x) } #Find which ones have trtS as coef which(sapply(best.mod.ds4.all, function(x) sum(names(x$coef) %in% "trtS"))==1) #ME10 ME12 ME13 ME16 ME23 ME24 ME28 ME29 ME48 # 11 13 14 17 24 25 29 30 49 #Fit the best model to each gene in each of these modules - do separate limma models for each module temp <- which(sapply(best.mod.ds4.all, function(x) sum(names(x$coef) %in% "trtS"))==1) i <- 1 des.temp <- model.matrix(formula(best.mod.ds4.all[[temp[i]]]), data = best.mod.ds4.all[[temp[i]]]$model) v.temp <- voom(d.filt[net.ds4$colors == (temp[i]-1),], des.temp) fit.temp <- eBayes(lmFit(v.temp, des.temp)) trt.mod.ds4.sep <- cbind(topTable(fit.temp, coef = "trtS", num = Inf, sort.by = "none"), module = temp[i]-1, bestModel = as.character(formula(best.mod.ds4.all[[temp[i]]]))[3]) for (i in 2:length(temp)) { des.temp <- model.matrix(formula(best.mod.ds4.all[[temp[i]]]), data = best.mod.ds4.all[[temp[i]]]$model) v.temp <- voom(d.filt[net.ds4$colors == (temp[i]-1),], des.temp) fit.temp <- eBayes(lmFit(v.temp, des.temp)) trt.mod.ds4.sep <- rbind(trt.mod.ds4.sep, cbind(topTable(fit.temp, coef = "trtS", num = Inf, sort.by = "none"), module = temp[i]-1, bestModel = as.character(formula(best.mod.ds4.all[[temp[i]]]))[3])) } #alternatively, do modeling on all genes but only pull out FC for the genes in the selected modules temp <- which(sapply(best.mod.ds4.all, function(x) sum(names(x$coef) %in% "trtS"))==1) i <- 1 des.temp <- model.matrix(formula(best.mod.ds4.all[[temp[i]]]), data = best.mod.ds4.all[[temp[i]]]$model) v.temp <- voom(d.filt, des.temp) fit.temp <- eBayes(lmFit(v.temp, des.temp)) trt.mod.ds4.all <- cbind(topTable(fit.temp, coef = "trtS", num = Inf, sort.by = "none")[net.ds4$colors == (temp[i]-1),], module = temp[i]-1, bestModel = as.character(formula(best.mod.ds4.all[[temp[i]]]))[3]) for (i in 2:length(temp)) { des.temp <- model.matrix(formula(best.mod.ds4.all[[temp[i]]]), data = best.mod.ds4.all[[temp[i]]]$model) v.temp <- voom(d.filt[net.ds4$colors == (temp[i]-1),], des.temp) fit.temp <- eBayes(lmFit(v.temp, des.temp)) trt.mod.ds4.all <- rbind(trt.mod.ds4.all, cbind(topTable(fit.temp, coef = "trtS", num = Inf, sort.by = "none")[net.ds4$colors == (temp[i]-1),], module = temp[i]-1, bestModel = as.character(formula(best.mod.ds4.all[[temp[i]]]))[3])) }
Better to split your tags to limma and voom separately, otherwise relevant people don't get notified.
ok - I split them. I only picked it because it came up as a tag! A case of error propogation...