Entering edit mode
Guest User
★
13k
@guest-user-4897
Last seen 10.2 years ago
Hi,
I have been using the IlluminaHumanMethylation450kprobe reference
manual to find the nearest TSS and corresponding EntrezGene ID
information for analysis of my 450k data, but I have a question.
The distance to TSS file I have generated contains 485512 rows, but
the 450k array covers 485577 CpGs - why is there a discrepancy? I used
the code from the reference manual as follows:
# find the nearest TSS and its corresponding EntrezGene ID
library(GenomicFeatures)
CpGs.unstranded = CpGs.450k
strand(CpGs.unstranded) = "*"
refgene.TxDb = makeTranscriptDbFromUCSC("refGene", genome="hg19")
# nearest forward TSS
TSS.forward = transcripts(refgene.TxDb,
vals=list(tx_strand="+"),
columns="gene_id")
nearest.fwd = precede(CpGs.unstranded, TSS.forward)
nearest.fwd.eg = nearest.fwd # to keep dimensions right
notfound = whichis.na(nearest.fwd)) # track for later
nearest.fwd.eg[-notfound] = as.character(elementMetadata(TSS.forward)$
gene_id[nearest.fwd[-notfound]])
TSSs.fwd = start(TSS.forward[nearest.fwd[-notfound]])
distToTSS.fwd = nearest.fwd # to keep dimensions right
distToTSS.fwd[-notfound] = start(CpGs.unstranded)[-notfound] -
TSSs.fwd
# note that these are NEGATIVE -- which is correct!
# nearest reverse TSS
TSS.reverse = transcripts(refgene.TxDb,
vals=list(tx_strand="-"),
columns="gene_id")
nearest.rev = precede(CpGs.unstranded, TSS.reverse)
nearest.rev.eg = nearest.rev # to keep dimensions right
notfound = whichis.na(nearest.rev)) # track for later
nearest.rev.eg[-notfound] = as.character(elementMetadata(TSS.reverse)$
gene_id[nearest.rev[-notfound]])
TSSs.rev = start(TSS.reverse[nearest.rev[-notfound]])
distToTSS.rev = nearest.rev # to keep dimensions right
distToTSS.rev[-notfound] = start(CpGs.unstranded)[-notfound] -
TSSs.rev
# now these are POSITIVE: we are walking up the opposite strand.
# tabulate and link these together for the annotation package:
IlluminaHumanMethylation450kprobe$fwd.dist <- distToTSS.fwd
IlluminaHumanMethylation450kprobe$fwd.gene_id <- nearest.fwd.eg
IlluminaHumanMethylation450kprobe$rev.dist <- distToTSS.rev
IlluminaHumanMethylation450kprobe$rev.gene_id <- nearest.rev.eg
FWD.CLOSER = with(IlluminaHumanMethylation450kprobe,
union( which( abs(fwd.dist) < abs(rev.dist) ),
which( is.na(rev.dist) ) ) )
REV.CLOSER = with(IlluminaHumanMethylation450kprobe,
union( which( abs(fwd.dist) > abs(rev.dist) ),
which( is.na(fwd.dist) ) ) )
IlluminaHumanMethylation450kprobe$DISTTOTSS =
pmin(abs(IlluminaHumanMethylation450kprobe$fwd.dist),
abs(IlluminaHumanMethylation450kprobe$rev.dist))
IlluminaHumanMethylation450kprobe$ENTREZ = NA
IlluminaHumanMethylation450kprobe$ENTREZ[FWD.CLOSER] =
IlluminaHumanMethylation450kprobe$fwd.gene_id
IlluminaHumanMethylation450kprobe$ENTREZ[REV.CLOSER] =
IlluminaHumanMethylation450kprobe$rev.gene_id
write.table(IlluminaHumanMethylation450kprobe$DISTTOTSS,
"DistToTSS.csv", sep=",")
write.table(IlluminaHumanMethylation450kprobe$ENTREZ, "ENTREZ.csv",
sep=",")
write.table(IlluminaHumanMethylation450kprobe$ENTREZ[FWD.CLOSER],
"Fwd.Closer.csv", sep=",")
write.table(IlluminaHumanMethylation450kprobe$ENTREZ[REV.CLOSER],
"Rev.Closer.csv", sep=",")
Thanks in advance.
-- output of sessionInfo():
R version 2.15.2 (2012-10-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)
locale:
[1] LC_COLLATE=English_Australia.1252 LC_CTYPE=English_Australia.1252
LC_MONETARY=English_Australia.1252
[4] LC_NUMERIC=C LC_TIME=English_Australia.1252
attached base packages:
[1] parallel stats graphics grDevices utils datasets
methods base
other attached packages:
[1] SNPlocs.Hsapiens.dbSNP.20110815_0.99.6
IlluminaHumanMethylation450kprobe_2.0.6
[3] IlluminaHumanMethylation450kannotation.ilmn.v1.2_0.1.3
IlluminaHumanMethylation450k.db_1.4.7
[5] org.Hs.eg.db_2.8.0
RSQLite_0.11.2
[7] DBI_0.2-5
IlluminaHumanMethylation450kmanifest_0.4.0
[9] BSgenome.Hsapiens.UCSC.hg19_1.3.19
BSgenome_1.26.1
[11] GEOquery_2.24.1
GenomicFeatures_1.10.2
[13] AnnotationDbi_1.20.5
minfi_1.4.0
[15] Biostrings_2.26.3
GenomicRanges_1.10.7
[17] IRanges_1.16.6
reshape_0.8.4
[19] plyr_1.8
lattice_0.20-13
[21] Biobase_2.18.0
BiocGenerics_0.4.0
[23] BiocInstaller_1.8.3
loaded via a namespace (and not attached):
[1] affyio_1.26.0 annotate_1.36.0 beanplot_1.1
biomaRt_2.14.0 bit_1.1-9
[6] bitops_1.0-5 codetools_0.2-8 crlmm_1.16.9
ellipse_0.3-7 ff_2.2-10
[11] foreach_1.4.0 genefilter_1.40.0 grid_2.15.2
iterators_1.0.6 limma_3.14.4
[16] MASS_7.3-23 Matrix_1.0-11 matrixStats_0.6.2
mclust_4.0 multtest_2.14.0
[21] mvtnorm_0.9-9994 nor1mix_1.1-3 oligoClasses_1.20.0
preprocessCore_1.20.0 R.methodsS3_1.4.2
[26] RColorBrewer_1.0-5 RcppEigen_0.3.1.2.1 RCurl_1.95-3
Rsamtools_1.10.2 rtracklayer_1.18.2
[31] siggenes_1.32.0 splines_2.15.2 stats4_2.15.2
survival_2.36-14 tools_2.15.2
[36] XML_3.95-0.1 xtable_1.7-1 zlibbioc_1.4.0
--
Sent via the guest posting facility at bioconductor.org.