Hi Dennis,
Here's some code for doing control probe filtering from one of my
pipelines
cheers,
Mark
#########################################################
#' Determine the type of each probeset, using data from a Probe Design
Info package (eg pd.hugene.1.0.st.v1).
#'
#' Each probeset is one of the following types: \cr
#' - main\cr
#' - normgene->intron\cr
#' - normgene->exon\cr
#' - rescue->FLmRNA->unmapped\cr
#' - control->affx\cr
#' - control->bgp->antigenomic\cr
#' - low-coverage\cr
#' \cr
#' Probeset levels\cr
#' Gene ST arrays only have core, or exon-level probesets (ie
\code{level='probeset'}).
#' Exon ST arrays have 3 levels of transcript-level probe qualities:
core, extended, full,
#' in addition to exon-level probesets.
#' \cr
#' Missing probes\cr
#' Sometimes there are probesets on the array (ie in the CEL files,
and in the pgf/clf)
#' which are not in the pd Info Package.
#' This is true for at least the MoGene 1.0 ST array. If you provide a
vector
#' of all probeset ID's, then those that are not in the pd info
package will be labelled
#' 'low-coverage' probesets.\cr
#' see here for more info:\cr
#' \url{
http://thread.gmane.org/gmane.science.biology.informatics.cond
uctor/19591}
#' AFAIK, HuGene array does not suffer from this, not sure about
RaGene.
#' Probe types\cr
#'
#' @param pd.package The name, or the actual object of a
\code{AffyGenePDInfo}, or
#' \code{AffyExonPDInfo} environment; see examples.
#' @param probesets an optional vector of all probesets of interest.
See details.
#' @param level Which probeset level are you interested in? One of
core, extended, full or probeset,
#' where the first 3 correspond to transcript-level probesets, and
4th = exon-level probesets.
#'
#' @return a \code{data.frame} with 2 columns: \dQuote{probeset_id}
and \dQuote{type}, where type is one of
#' the values mentioned in details.
#' @author Mark Cowley, 2010-02-03
#' @examples
#' \dontrun{
#'
#' ## transcript-level analysis on Gene ST arrays
#' a <- pd.package.probeset2type("pd.hugene.1.0.st.v1")
#' a <- pd.package.probeset2type( get("pd.hugene.1.0.st.v1") )
#' sum(a$type =="main")
#' # [1] 28869
#' # this is illegal on Gene ST arrays
#' a <- pd.package.probeset2type("pd.hugene.1.0.st.v1",
level="extended")
#' a <- pd.package.probeset2type("pd.hugene.1.0.st.v1", level="full")
#'
#' ## exon-level analysis of GeneST arrays
#' a <- pd.package.probeset2type("pd.hugene.1.0.st.v1",
level="probeset")
#' sum(a$type =="main")
#' # [1] 253002
#'
#' ## transcript-level analysis on Exon ST arrays
#' a <- pd.package.probeset2type("pd.huex.1.0.st.v2", level="core")
#' sum(a$type =="main")
#' # [1] 17859
#' a <- pd.package.probeset2type("pd.huex.1.0.st.v2",
level="extended")
#' sum(a$type =="main")
#' # [1] 129532
#' a <- pd.package.probeset2type("pd.huex.1.0.st.v2", level="full")
#' sum(a$type =="main")
#' # [1] 262266
#'
#' ## exon-level analysis of Exon ST arrays
#' a <- pd.package.probeset2type("pd.huex.1.0.st.v2",
level="probeset")
#' sum(a$type =="main")
#' # [1] 1407090
#'
#' }
pd.package.probeset2type <- function(pd.package, probesets=NULL,
level=c("core", "extended", "full", "probeset")[1]) {
!missing(pd.package) || stop("pd.package must be supplied")
if ( class(pd.package)[1] == "character" ) {
require(pd.package, character.only=TRUE) ||
stop("Can't load the pd.package")
pd.package <- get(pd.package)
}
if( ! class(pd.package)[1] %in% c("AffyGenePDInfo",
"AffyExonPDInfo") )
stop("Unsupported argument. It should be a
AffyGenePDInfo, or AffyExonPDInfo object, eg pd.hugene.1.0.st.v1, or
pd.huex.1.0.st.v2.\n")
if( class(pd.package)[1] == "AffyGenePDInfo" && level %in%
c("extended", "full") )
stop("Unsupported probesets argument. Can't choose
core or extended for an AffyGenePDInfo package\n")
conn <- db(pd.package)
# conn <- db(pd.hugene.1.0.st.v1)
probe2type <- switch(level,
core=dbGetQuery(conn,
paste("SELECT DISTINCT meta_fsetid AS
probeset_id, type_id AS type",
"FROM featureSet, core_mps, type_dict",
"WHERE featureSet.fsetid=core_mps.fsetid",
"AND featureSet.type=type_dict.type")),
extended=dbGetQuery(conn,
paste("SELECT DISTINCT
meta_fsetid AS probeset_id, type_id AS type",
"FROM featureSet, extended_mps,
type_dict",
"WHERE
featureSet.fsetid=extended_mps.fsetid",
"AND
featureSet.type=type_dict.type")),
full=dbGetQuery(conn,
paste("SELECT DISTINCT meta_fsetid AS
probeset_id, type_id AS type",
"FROM featureSet, full_mps,
type_dict",
"WHERE
featureSet.fsetid=full_mps.fsetid",
"AND
featureSet.type=type_dict.type")),
probeset= dbGetQuery(conn,
paste("SELECT DISTINCT fsetid AS
probeset_id, type_id AS type",
"FROM featureSet, type_dict",
"WHERE
featureSet.type=type_dict.type"))
)
## there are sometimes probes that are not in the pd info
package (eg the Mogene array has 38 'low-coverage-probes'.)
## see:
http://thread.gmane.org/gmane.science.biology.informat
ics.conductor/19591/
lowcoverage.probes <- setdiff(probesets, probe2type[,1]) #
works even if probesets==NULL
if( length(lowcoverage.probes) > 0 ) {
tmp <- data.frame(probeset_id=lowcoverage.probes, type
="low-coverage")
nrow(tmp)
# [1] 38
probe2type <- rbind(probe2type, tmp)
}
if( !is.null(probesets) ) {
probe2type <- probe2type[match(probesets,
probe2type$probeset_id), ]
}
else {
probe2type <- probe2type[order(probe2type[,1]), ]
# head(probe2type)
# # probeset_id type
# # 1 10338001 control->affx
# # 2 10338002 control->bgp->antigenomic
# # 3 10338003 control->affx
# # 4 10338004 control->affx
# # 5 10338005 control->bgp->antigenomic
# # 6 10338006 control->bgp->antigenomic
}
return( probe2type )
}
######################################################################
##########
library(oligo)
probesets <- "core"
controls <- c("exclude", "include")[1]
cel.files <- list.celfiles(getwd())
chiptypes <- sapply(cel.files, function(x) oligo:::getCelChipType(x,
useAffyio=TRUE))
pd.pkg.name <- cleanPlatformName(chiptypes[1]) # oligo function
ExprFeatSet <- read.celfiles(filenames=cel.files, verbose=FALSE,
checkType=FALSE)
xdata <- rma(ExprFeatSet, target=probesets)
##########################################
# Classify probesets by their control type.
# This works for meta-probesets, and exon-level probesets.
#
probe2type <- pd.package.probeset2typepd.pkg.name,
probesets=featureNames(xdata), level=probesets)
counts <- sort(table(probe2type$type), decreasing=TRUE)
counts <- data.frame(category=sprintf("%26s", names(counts)),
Freq=counts, stringsAsFactors=FALSE)
rownames(counts) <- NULL
cat("Breakdown of number of probesets vs category:\n")
print(counts)
cat("\n")
# main 28869
# normgene->intron 2904
# normgene->exon 1195
# rescue->FLmRNA->unmapped 227
# control->affx 57
# control->bgp->antigenomic 45
##########################################
##########################################
# throw away the control probes??
# ... and low-coverage probes? (
http://thread.gmane.org/gmane.
science.biology.informatics.conductor/19591/)
#
if( controls == "exclude" ) {
cat(sprintf("Excluding %d control probesets, leaving
%d probesets.\n\n", sum(probe2type$type != "main"),
sum(probe2type$type == "main")))
probe.ids <-
as.character(probe2type$probeset_id[probe2type$type == "main"])
probe.ids <- featureNames(xdata) %in% probe.ids
xdata <- subset(xdata, probe.ids)
}
##########################################
On 16/02/2013, at 8:16 AM, Dennis Burian <dburian at="" cox.net=""> wrote:
> I have a large time course experiment on Gene 1.0 ST GeneChips that
I'm
> analyzing with the oligo and timecourse packages in Bioconductor. In
my
> experimental design, this analysis method determines genes most
likely
> to have different temporal expression patterns between two
biological
> conditions.
>
> Where I'm running into trouble is many of my highest scoring genes
are
> intronic and exonic controls. I surmise that the software is scoring
> these probe sets highly because they are "expressed" at very low but
> noisy levels. Examination of the expression profiles confirms this
> hypothesis, expression levels between 0 and 4 so I'm not concerned
about
> quality of the data.
>
>> From the oligo.pdf manual in rma-methods, core is the default
setting
> for target. My question is whether there is another setting for
target
> that would filter out the intronic/exonic control probe sets prior
to
> differential expression testing? Is subset implemented and if so
what
> flag would I use to take advantage of it?
>
>> sessionInfo()
> R version 2.15.0 (2012-03-30)
> Platform: x86_64-redhat-linux-gnu (64-bit)
>
> locale:
> [1] LC_CTYPE=en_US.utf8 LC_NUMERIC=C
> [3] LC_TIME=en_US.utf8 LC_COLLATE=en_US.utf8
> [5] LC_MONETARY=en_US.utf8 LC_MESSAGES=en_US.utf8
> [7] LC_PAPER=C LC_NAME=C
> [9] LC_ADDRESS=C LC_TELEPHONE=C
> [11] LC_MEASUREMENT=en_US.utf8 LC_IDENTIFICATION=C
>
> attached base packages:
> [1] stats graphics grDevices utils datasets methods
> base
>
> other attached packages:
> [1] pd.hugene.1.0.st.v1_3.6.0 RSQLite_0.11.2
> [3] DBI_0.2-5 timecourse_1.28.0
> [5] MASS_7.3-22 oligo_1.20.4
> [7] oligoClasses_1.18.0
>
> loaded via a namespace (and not attached):
> [1] affxparser_1.28.1 affyio_1.24.0 Biobase_2.16.0
> [4] BiocGenerics_0.2.0 BiocInstaller_1.4.9 Biostrings_2.24.1
> [7] bit_1.1-9 codetools_0.2-8 compiler_2.15.0
> [10] ff_2.2-10 foreach_1.4.0 IRanges_1.14.4
> [13] iterators_1.0.6 limma_3.12.3 marray_1.34.0
> [16] preprocessCore_1.18.0 splines_2.15.0 stats4_2.15.0
> [19] tools_2.15.0 zlibbioc_1.2.0
>>
>
> Many thanks,
> Dennis Burian
>
> _______________________________________________
> Bioconductor mailing list
> Bioconductor at r-project.org
>
https://stat.ethz.ch/mailman/listinfo/bioconductor
> Search the archives:
http://news.gmane.org/gmane.science.biology.informatics.conductor