You can do that on the fly, using the TxDb
and the OrgDb
for that organism. Here is a function that I wrote recently to create a gene plot, followed by coverage plots for zero or more BAM files, that I use for non-model organisms.
makeGGplot <- function(gene, bams = NULL, txdb, orgdb, bamnames, geom = "gapped.pair"){
## gene is gene symbol
## bams is a vector of bamfile paths. Ensure they are indexed
## txdb and orgdb are obvious
## bamnames is a vector of names for resulting ggplot
## geom = "line" is in general faster
require("biovizBase", quietly = TRUE, character.only = TRUE)
require("ggbio", quietly = TRUE, character.only = TRUE)
egid <- mapIds(orgdb, gene, "ENTREZID", "SYMBOL")
gnrng <- genes(txdb, single.strand.genes.only = FALSE)[[egid]]
if(length(gnrng) > 1) warning("Doubledog!")
tx <- crunch(txdb, which = gnrng)
tx <- split(tx, tx$tx_id)
lbs <- select(txdb, names(tx), c("TXNAME","TXID","GENEID"), "TXID")
lbs$SYMBOL <- mapIds(orgdb, lbs$GENEID, "SYMBOL", "ENTREZID")
ind <- lbs$GENEID %in% egid
lbs <- lbs[ind,]
tx <- tx[ind]
names(tx) <- lbs$SYMBOL
gnplot <- ggplot() + do.call(geom_alignment, c(list(data = tx, names.expr = "SYMBOL")))
if(is.null(bams)){
gnplot
} else {
gnplot <- list(gnplot)
gnrng <- keepSeqlevels(gnrng, as.character(seqnames(gnrng)))
bl <- BamFileList(bams)
bamplot <- lapply(bl, autoplot, which = gnrng, geom=geom)
names(bamplot) <- bamnames
tracks(c(gnplot, bamplot), heights = c(4, rep(1, length(bams))), label.text.angle = 0, xlim = gnrng,
label.text.cex = 0.75)
}
}
Doing say
> hub <- AnnotationHub()
|======================================================================| 100%
snapshotDate(): 2019-10-29
> query(hub, c("elegans","txdb"))
AnnotationHub with 8 records
# snapshotDate(): 2019-10-29
# $dataprovider: UCSC
# $species: Caenorhabditis elegans
# $rdataclass: TxDb
# additional mcols(): taxonomyid, genome, description,
# coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
# rdatapath, sourceurl, sourcetype
# retrieve records with, e.g., 'object[["AH52249"]]'
title
AH52249 | TxDb.Celegans.UCSC.ce11.refGene.sqlite
AH52250 | TxDb.Celegans.UCSC.ce6.ensGene.sqlite
AH53778 | TxDb.Celegans.UCSC.ce11.refGene.sqlite
AH57984 | TxDb.Celegans.UCSC.ce11.refGene.sqlite
AH61790 | TxDb.Celegans.UCSC.ce11.refGene.sqlite
AH66167 | TxDb.Celegans.UCSC.ce11.refGene.sqlite
AH70583 | TxDb.Celegans.UCSC.ce11.ensGene.sqlite
AH70584 | TxDb.Celegans.UCSC.ce11.refGene.sqlite
> txdb <- hub[["AH70584"]]
> query(hub, c("elegans","orgdb"))
AnnotationHub with 2 records
# snapshotDate(): 2019-10-29
# $dataprovider: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
# $species: Drosophila elegans, Caenorhabditis elegans
# $rdataclass: OrgDb
# additional mcols(): taxonomyid, genome, description,
# coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
# rdatapath, sourceurl, sourcetype
# retrieve records with, e.g., 'object[["AH75747"]]'
title
AH75747 | org.Ce.eg.db.sqlite
AH76581 | org.Drosophila_elegans.eg.sqlite
> orgdb <- hub[["AH75747"]]
> makeGGplot("homt-1", NULL, txdb, orgdb, "nobams")
Seems to work OK.
Just to clarify, the
genesymbol
dataset is really just a convenience for demonstration purposes. Jim is right: ggbio should directly support TxDb and other annotation objects.