I'm using edgeR + genefilter. I have used it for a few different projects already but for this one I'm getting a chosen theta = 0.6. I wonder if this is something that I need to be concerned of? It just feels like a lot of genes!
Session Info here:
sessionInfo() R version 3.2.5 (2016-04-14) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows 7 x64 (build 7601) Service Pack 1 locale: [1] LC_COLLATE=English_United States.1252 [2] LC_CTYPE=English_United States.1252 [3] LC_MONETARY=English_United States.1252 [4] LC_NUMERIC=C [5] LC_TIME=English_United States.1252 attached base packages: [1] parallel stats4 stats graphics grDevices utils datasets [8] methods base other attached packages: [1] plyr_1.8.3 pvclust_2.0-0 statmod_1.4.24 [4] org.Hs.eg.db_3.2.3 RSQLite_1.0.0 DBI_0.4 [7] AnnotationDbi_1.32.3 IRanges_2.4.8 S4Vectors_0.8.11 [10] Biobase_2.30.0 BiocGenerics_0.16.1 edgeR_3.12.1 [13] limma_3.26.9 genefilter_1.52.1 BiocInstaller_1.20.1 loaded via a namespace (and not attached): [1] Rcpp_0.12.4 lattice_0.20-33 XML_3.98-1.4 grid_3.2.5 [5] xtable_1.8-2 annotate_1.48.0 Matrix_1.2-6 splines_3.2.5 [9] tools_3.2.5 survival_2.39-2
Here is the code:
library(edgeR)
library(genefilter)
countsFile <- file.path("htseqcount_result.txt")
sampleInfoFile <- file.path("groupinfo.csv")
outdir <- file.path("edgeR")
counts <- read.table(countsFile, sep = "\t", header = TRUE, row.names=1,check.names = FALSE)
sampleInfo$Group <- sampleInfo$condition
#################
## Run edgeR ##
#################
## Construct DGEList
d <- DGEList(counts=counts)
## Make design matrix
Group = factor(sampleInfo$Group)
design <- model.matrix(~0+Group)
colnames(design) <- levels(Group)
rownames(design) <- colnames(counts)
my.contrasts = makeContrasts(Treated_vs_Untreated=Treated-Untreated,levels=design)
d <- calcNormFactors(d)
d <- estimateGLMCommonDisp(d, design)
d <- estimateGLMTagwiseDisp(d, design)
## Fit to model
fit <- glmFit(d, design)
alpha = 0.1
FDRcutoff = 0.05
logFCCutoff = 1
####################
## Run analysis ##
####################
lrt <- glmLRT(fit, contrast=my.contrasts)
dgeTable = topTags(lrt, n=nrow(d))
## try genefilter:
filtResult = tryGeneFilter(forcedManualFiltering = FALSE, min_reads = 5, nGroup = 4,
dgeList = d,testResultDgeTable =dgeTable, alpha = alpha,
contrast = contrast, designMatrix = design,
processPlots2screen = TRUE,
processPlots2pdf = TRUE,
pdf_filepath = file.path(output.dir,paste(compPair,"_genefilterProcess.pdf",sep="")))
if (filtResult$method == "manualFilter") ## manual filtering
{
d2 <- filtResult$dgeFiltered
d2 <- calcNormFactors(d2)
d2 <- estimateGLMCommonDisp(d2, design)
d2 <- estimateGLMTagwiseDisp(d2, design)
fit2 <- glmFit(d2, design)
lrt2 <- glmLRT(fit2, contrast=contrast)
dgeTable2 = topTags(lrt2, n=nrow(d))
resultTable = dgeTable2
} else if((filtResult$method == "genefilter"))
{
resultTable <- filtResult$adjustedTestResultTable
}
##############################################
# Function
##############################################
tryGeneFilter <- function( forcedManualFiltering = FALSE, min_reads = 5, nGroup = 3, dgeList,
testResultDgeTable, filter_method = "rowSums",
contrast = NA, designMatrix = NA,
theta = seq(from=0, to=0.8, by=0.01),alpha = 0.1,
processPlots2screen = TRUE, processPlots2pdf = FALSE, pdf_filepath = NA
)
{
#################################################################################
## Args for manual filtering: ##
## Filtering genes with low expression / zero count. At least 1/nGroup ##
## of the samples have at least min_reads or more reads. Filter base on ##
## the lib.size ##
## - min_reads ##
## - nGroup : generally the number of different groups ##
## - dgeList ##
## Args for genefilter: ##
## - dgeList ##
## - testResultDgeTable ##
## - filter_method ##
## - contrast: if not NA, only calculate filterstats based on selected ##
## samples that are used to run the test. e.g., contrast used ##
## for design matrix. If provided, design matrix should also ##
## be provided.It's a vector of 0, -1, 1, with names matching ##
## column names of the design matrix. ##
## - designMatrix: the design matrix used for running the glm test. ##
## - theta: the sequence of tested percentage for gene filter. ##
## - alpha: FDR cut-off used for maximizing rejected genes ## ##
## Return: ##
## - If using manual filter: return new filtered dge list for fitting to ##
## the model and the "keep" vector ##
## - If using genefilter: return the new result table and also the keep ##
## vector. ##
#################################################################################
## ------------------------- ##
## Manual filtering ##
## ------------------------- ##
n_samples = ncol(dgeList)
cpm_cutoff = min_reads/(max(dgeList$samples$lib.size)/1000000)
keep_manual = rowSums(cpm(dgeList)>cpm_cutoff) >= round(n_samples/nGroup)
if (forcedManualFiltering)
{
## Use manual filtering:
d <- dge[keep_manual,]
print("Forced manual filtering. Filtered dge object returned. Fit model again.")
return(list("dgeFiltered" = d, "keep" = keep_manual, "method" = "manualFilter"))
} else {
## ------------------------- ##
## Gene filter ##
## ------------------------- ##
require(genefilter)
require(edgeR)
filter_methods <- c("rowSums")
if (!filter_method %in% filter_methods)
{
stop(paste("filter_methods should be one of the following:",paste(filter_methods, collapse=","),sep=""))
} else if (filter_method == "rowSums")
{
if(length(contrast)==1)
{
## All samples used for making filterstats. E.g. when exact test is used, or when there are only 2 groups
if(is.na(contrast))
{
filterstats = rowSums(dgeList$counts[rownames(testResultDgeTable),])
} else
{
warning("Input contrast is not NA but has only length=1")
}
} else
{
## Only use a subset of samples corresponding to the one that the test was run on.
## First check:
if (class(designMatrix)!="matrix")
{
stop("designMatrix needs to be a matrix when contrast is provided.")
} else
{
if (sum(names(contrast) %in% colnames(design))<length(contrast))
{
stop("Names of contrast needs to be in column names of design matrix")
} else
{
includedGroups = names(contrast)[contrast!=0]
includedSamplesVector = rep(0,length = nrow(design))
names(includedSamplesVector) <- rownames(design)
for (g in includedGroups)
{
includedSamplesVector = includedSamplesVector + design[,g]
}
includedSamples = names(includedSamplesVector)[includedSamplesVector>0]
filterstats = rowSums(d$counts[rownames(dgeTable),includedSamples])
}
}
}
}
res = data.frame(
filterstat = filterstats,
pvalue = as.data.frame(testResultDgeTable)$PValue,
row.names = rownames(testResultDgeTable))
# pdf(paste("genefilterFilterstat_",contrast[2],"_vs_",contrast[1],".pdf", sep="") , w=11, h=8.5)
# print(with(res,plot(rank(filterstat)/length(filterstat), -log10(pvalue), pch=16, cex=0.45)))
# dev.off()
trsf = function(n) log10(n+1)
# plot(ecdf(trsf(res$filterstat)), xlab=body(trsf), main="")
pBH = filtered_p(filter=res$filterstat, test=res$pvalue, theta=theta, method="BH")
rejBH = filtered_R(alpha=alpha, filter=res$filterstat, test=res$pvalue, theta=theta, method="BH")
#####################
## Print plots ##
#####################
if (processPlots2screen)
{
rejection_plot(pBH, at="sample",
xlim=c(min(theta), max(theta)), ylim=c(0, 2000),
xlab="FDR cutoff (Benjamini & Hochberg adjusted p-value)", main="")
plot(theta, rejBH, type="l",xlab=expression(theta), ylab= paste("number of rejections (alpha=",alpha,")",sep=""))
}
if(processPlots2pdf)
{
if(is.na(pdf_filepath))
{
pdf("genefilter.pdf")
} else
{
pdf(pdf_filepath)
}
rejection_plot(pBH, at="sample",
xlim=c(min(theta), max(theta)), ylim=c(0, 2000),
xlab="FDR cutoff (Benjamini & Hochberg adjusted p-value)", main="")
plot(theta, rejBH, type="l",xlab=expression(theta), ylab= paste("number of rejections (alpha=",alpha,")",sep=""))
dev.off()
}
#######################################################################
## Decide which theta to choose based on maximum rejection number ##
#######################################################################
# if(max(rejBH)==0 | rejBH["0%"]== max(rejBH))
if(max(rejBH)==0)
{
## Use manual filtering:
d <- dge[keep_manual,]
notes = "Manual filtering chosen. Filtered dge object returned. Fit model again."
print(notes)
return(list("dgeFiltered" = d, "keep" = keep_manual, "method" = "manualFilter", "notes" = notes))
} else {
theta_chosen = theta[max(which(rejBH==max(rejBH)))]
use = res$filterstat > quantile(res$filterstat, probs=theta_chosen)
names(use) <- rownames(res)
adjustedTestResult = as.data.frame(testResultDgeTable[use, ])
newFDR <- p.adjust(adjustedTestResult[,"PValue"], method = "BH")
adjustedTestResult[,"FDR"] <- newFDR
notes = "Genefilter used. adjusted test result (dgeTable) returned with new FDR."
print(notes)
return(list("adjustedTestResultTable" = adjustedTestResult, "keep" = use, "method" = "genefilter", "theta_chosen" = theta_chosen, "notes" = notes))
# use <- use[rownames(dge)]
# dgeFilt=dge[use,]
# d <- dgeFilt
}
}
}

A vague question like that can only be responded to with a vague answer. If you want actual help, you need to say what exactly you mean by 'using edgeR + genefilter', preferably by showing your code.
Thanks! Just edited the question and attached the code I used.