Question

How to mimic biomaRt results with AnnotationHub?

0

Entering edit mode

Ezgi ▴ 60

@ezgi-24130

Last seen 3.1 years ago

United States

We have a workflow in my current institute that depends on the biomaRt package to get gene start positions for a given set of gene symbols. However, every now and then Ensembl connection fails and this causes a break in the pipeline. I would like to replace this step with the AnnotationHub method, so we can cache this information and don't rely on Ensembl anymore.

For some reason, for a given set of HGNC symbols, I get more hits from biomaRt compared to AnnotationHub. I am wondering how I can mimic the biomaRt steps with AnnotationHub, so I can get the same gene list back? I would also appreciate any suggestions to make my AnnotationHub code cleaner. Thanks in advance!

Here's a toy example:

Say, the following are my 100 gene symbols:

gene_names <- c("PHKA1P1", "RPL27P7", "RNA5SP210", "C21orf91", "PPP1R26P4", 
                "OR5BL1P", "LINC01348", "TPT1P13", "P3H4", "EGLN3P1", "ATP1B2", 
                "LINC00471", "GLT8D1", "DGKZP1", "RNU105C", "CDRT2", "GEMIN8P1", 
                "RNY5P10", "TOPBP1", "TRE-CTC6-1", "RPS12P5", "KIF3A", "RNA5SP259", 
                "YAP1P2", "DDX18P2", "ITCH", "TBC1D3C", "ERCC6L2", "USP2", 
                "TMEM108-AS1", "OR2A9P", "JAKMIP2", "MIR155HG", "MIR7844", 
                "MRPS15", "TMEM132A", "MYL6P1", "DCAF8", "SLC31A1P1", "PRDX4", 
                "FIZ1", "PSMD9", "KCNJ13", "RPL23AP33", "PIRC63", "ZC2HC1B", 
                "RPS25P6", "RPSAP42", "RNU7-89P", "KAT2B", "SP6", "SLC6A6", 
                "CLDN4", "TRBV3-2", "CSMD2-AS1", "MIR7112", "SENCR", "ARL2BPP1", 
                "RAB1A", "SNHG9", "POM121L7", "RNASE12", "CFAP74", "HHIP", "KRT8P6", 
                "TIFA", "LINC01349", "MIR3190", "KRTAP5-6", "HLTF", "MORC3", 
                "MIR1281", "RASGEF1A", "RBM26", "ZNF286A", "OR2A15P", "NHP2", 
                "MIR3910-1", "MRPL30", "TPP1", "ADAM5", "ZNF826P", "RNASE11", 
                "IGSF10", "NEK1", "KRTAP5-3", "VN1R112P", "OR51A3P", "CLCC1", 
                "RBMY2WP", "CT47A6", "LGALS14", "CBFB", "ANKRD49", "TRBV7-2", 
                "OSBPL2", "SLC9A9", "KLF15", "OPN1LW", "SCCPDH")
length(gene_names)

Here's how I get the gene start position with biomaRt, which returns 89 genes:

library(biomaRt)
ensembl <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host = "uswest.ensembl.org")
k <- getBM(attributes = c('hgnc_symbol', 'chromosome_name', 'start_position'), filters = 'hgnc_symbol', values=gene_names, mart=ensembl)

# some additional steps to normalize chr names
k$chromosome_name[k$chromosome_name == "X"] <- 23
k$chromosome_name[k$chromosome_name == "Y"] <- 24
k$chromosome_name <- as.numeric(as.character(k$chromosome_name))
k <- k[complete.cases(k),]

# check number of returned genes
nrow(k)
length(unique(k$hgnc_symbol))

Here's my attempt to create the same data.frame with AnnotationHub, which returns 64 genes:

library(AnnotationHub)
ah <- AnnotationHub()
bmrt <- query(ah, "BioMart")
txdb <- bmrt[["AH52256"]]
geneid <- bmrt[["AH69106"]]
colnames(geneid) <- c("entrezid", "hgnc_symbol")

seqlevelsStyle(txdb) <- "NCBI"
gr <- genes(txdb, columns=c("TXCHROM"), filter=list("gene_id"=paste0("GeneID:", geneid$entrezid[geneid$hgnc_symbol %in% gene_names])))
gr <- keepSeqlevels(gr, value = c(1:22, "X", "Y"))

genepos_df <- data.frame(chromosome_name=as.factor(as.character(mcols(gr)$TXCHROM)),
                start_position=start(gr),
                entrezid=gsub("GeneID:","", gr@ranges@NAMES))
genepos_df <- merge(geneid, genepos_df)
levels(genepos_df$chromosome_name) <- 1:24
genepos_df$chromosome_name <- as.character(genepos_df$chromosome_name)
genepos_df$entrezid <- NULL

nrow(genepos_df)

And my session info:

sessionInfo( )

R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.6

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] GenomicFeatures_1.42.0 AnnotationDbi_1.52.0   Biobase_2.50.0         GenomicRanges_1.42.0   GenomeInfoDb_1.26.0    IRanges_2.24.0         S4Vectors_0.28.0      
 [8] AnnotationHub_2.22.0   BiocFileCache_1.14.0   dbplyr_1.4.4           BiocGenerics_0.36.0    biomaRt_2.46.0        

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.5                    lattice_0.20-41               prettyunits_1.1.1             Rsamtools_2.6.0               Biostrings_2.58.0            
 [6] assertthat_0.2.1              digest_0.6.25                 mime_0.9                      R6_2.4.1                      RSQLite_2.2.1                
[11] httr_1.4.2                    pillar_1.4.6                  zlibbioc_1.36.0               rlang_0.4.7                   progress_1.2.2               
[16] curl_4.3                      rstudioapi_0.11               blob_1.2.1                    Matrix_1.2-18                 BiocParallel_1.24.0          
[21] stringr_1.4.0                 RCurl_1.98-1.2                bit_4.0.4                     DelayedArray_0.16.0           shiny_1.5.0                  
[26] compiler_4.0.2                httpuv_1.5.4                  rtracklayer_1.50.0            pkgconfig_2.0.3               askpass_1.1                  
[31] htmltools_0.5.0               SummarizedExperiment_1.20.0   openssl_1.4.2                 tidyselect_1.1.0              tibble_3.0.3                 
[36] GenomeInfoDbData_1.2.4        interactiveDisplayBase_1.28.0 matrixStats_0.57.0            XML_3.99-0.5                  crayon_1.3.4                 
[41] dplyr_1.0.1                   later_1.1.0.1                 GenomicAlignments_1.26.0      bitops_1.0-6                  rappdirs_0.3.1               
[46] grid_4.0.2                    xtable_1.8-4                  lifecycle_0.2.0               DBI_1.1.0                     magrittr_1.5                 
[51] stringi_1.4.6                 XVector_0.30.0                promises_1.1.1                xml2_1.3.2                    ellipsis_0.3.1               
[56] generics_0.0.2                vctrs_0.3.2                   tools_4.0.2                   bit64_4.0.5                   glue_1.4.1                   
[61] purrr_0.3.4                   BiocVersion_3.12.0            MatrixGenerics_1.2.0          hms_0.5.3                     fastmap_1.0.1                
[66] yaml_2.2.1                    BiocManager_1.30.10           memoise_1.1.0

biomaRt AnnotationHub • 1.5k views

ADD COMMENT • link 4.4 years ago Ezgi ▴ 60

score 1 · Accepted Answer · 2020-11-11

Any time you are doing something like gr@ranges@NAMES you should probably try to find the actual accessor. Digging around in S4 objects like that is probably suboptimal.

Anyway, you can just use an EnsDb object to do that in one shot.

> z <- hub[["AH83216"]]
> z
EnsDb for Ensembl:
|Backend: SQLite
|Db type: EnsDb
|Type of Gene ID: Ensembl Gene ID
|Supporting package: ensembldb
|Db created by: ensembldb package from Bioconductor
|script_version: 0.3.5
|Creation time: Fri Aug 21 21:07:29 2020
|ensembl_version: 101
|ensembl_host: localhost
|Organism: Homo sapiens
|taxonomy_id: 9606
|genome_build: GRCh38
|DBSCHEMAVERSION: 2.1
| No. of genes: 67990.
| No. of transcripts: 252335.
|Protein data available.
> zz <- genes(z)
> zzz <- zz[zz$gene_name %in% gene_names]
> length(zzz)
[1] 97
> as.data.frame(zzz)[,c(7,1,2)]
                  gene_name             seqnames     start
ENSG00000142609      CFAP74                    1   1921951
ENSG00000231163   CSMD2-AS1                    1  33868953
ENSG00000116898      MRPS15                    1  36455718
ENSG00000232882     PHKA1P1                    1  90892992
ENSG00000230402   LINC01349                    1 100627049
ENSG00000121940       CLCC1                    1 108927361
ENSG00000132716       DCAF8                    1 160215715
ENSG00000280587   LINC01348                    1 235065479
ENSG00000143653      SCCPDH                    1 246724409
ENSG00000198915    RASGEF1A                   10  43194535
ENSG00000196224    KRTAP5-3                   11   1607565
ENSG00000205864    KRTAP5-6                   11   1697195
ENSG00000227023     OR51A3P                   11   4937371
ENSG00000166340        TPP1                   11   6612768
ENSG00000272569     OR5BL1P                   11  58171098
ENSG00000006118    TMEM132A                   11  60924460
ENSG00000168876     ANKRD49                   11  94493979
ENSG00000036672        USP2                   11 119355215
ENSG00000254703       SENCR                   11 128691672
ENSG00000256134     EGLN3P1                   12  15971890
ENSG00000283818     MIR7844                   12  94571231
ENSG00000110801       PSMD9                   12 121888732
ENSG00000179611      DGKZP1                   13  43968424
ENSG00000139746       RBM26                   13  79311824
ENSG00000173464     RNASE11                   14  20583559
ENSG00000258436     RNASE12                   14  20590193
ENSG00000255198       SNHG9                   16   1964959
ENSG00000067955        CBFB                   16  67028984
LRG_1364               CBFB                   16  67029149
ENSG00000129244      ATP1B2                   17   7646627
ENSG00000187607     ZNF286A                   17  15699577
ENSG00000278299     TBC1D3C                   17  38057693
ENSG00000141696        P3H4                   17  41801947
ENSG00000189120         SP6                   17  47844908
ENSG00000265934    ARL2BPP1                   18  76703444
ENSG00000231205     ZNF826P                   19  20340269
ENSG00000006659     LGALS14                   19  39704481
ENSG00000265134     MIR3190                   19  47226942
ENSG00000179943        FIZ1                   19  55591376
ENSG00000138069       RAB1A                    2  65070696
ENSG00000185414      MRPL30                    2  99181152
ENSG00000233741     RPL27P7                    2 119993199
ENSG00000237824   RPL23AP33                    2 184902111
ENSG00000181798   LINC00471                    2 231508422
ENSG00000115474      KCNJ13                    2 232765802
ENSG00000078747        ITCH                   20  34363241
LRG_354                ITCH                   20  34363256
ENSG00000130703      OSBPL2                   20  62231922
ENSG00000154642    C21orf91                   21  17788974
ENSG00000234883    MIR155HG                   21  25561909
ENSG00000159256       MORC3                   21  36320189
ENSG00000226543      MYL6P1                   21  43855890
ENSG00000278008   PPP1R26P4                   22  18715815
ENSG00000284015     MIR1281                   22  41092513
ENSG00000131389      SLC6A6                    3  14402576
ENSG00000114166       KAT2B                    3  20040446
ENSG00000226216     RPS12P5                    3  29390848
ENSG00000016864      GLT8D1                    3  52694488
ENSG00000163884       KLF15                    3 126342635
ENSG00000251011 TMEM108-AS1                    3 133245603
ENSG00000163781      TOPBP1                    3 133598175
ENSG00000181804      SLC9A9                    3 143265222
ENSG00000071794        HLTF                    3 149030127
ENSG00000152580      IGSF10                    3 151425384
ENSG00000224426   SLC31A1P1                    3 172603440
ENSG00000145365        TIFA                    4 112274537
ENSG00000164161        HHIP                    4 144646156
ENSG00000137601        NEK1                    4 169392809
ENSG00000131437       KIF3A                    5 132692628
ENSG00000176049     JAKMIP2                    5 147585438
ENSG00000145912        NHP2                    5 178149463
LRG_346                NHP2                    5 178149463
ENSG00000212336   RNA5SP210                    6  81622200
ENSG00000219463     RPSAP42                    6 137995270
ENSG00000118491     ZC2HC1B                    6 143864436
ENSG00000189143       CLDN4                    7  73799542
ENSG00000282939     TRBV7-2                    7 142352819
ENSG00000239981     OR2A15P                    7 144118461
ENSG00000228960      OR2A9P                    7 144294480
ENSG00000202411   RNA5SP259                    8  28530411
ENSG00000196115       ADAM5                    8  39314591
ENSG00000199212     RNU105C                    8  54330687
ENSG00000284054     MIR7112                    8 144262673
ENSG00000266855   MIR3910-1                    9  91636251
ENSG00000182150     ERCC6L2                    9  95871264
ENSG00000288273      CT47A6      CHR_HG439_PATCH 120877498
ENSG00000275920    KRTAP5-3   CHR_HSCHR11_1_CTG6   1613275
ENSG00000281191    KRTAP5-3 CHR_HSCHR11_2_CTG1_1   1607591
ENSG00000277389    KRTAP5-6 CHR_HSCHR11_2_CTG1_1   1694414
ENSG00000282506     TRBV7-2    CHR_HSCHR7_2_CTG6 142374511
ENSG00000123131       PRDX4                    X  23664262
ENSG00000236080      YAP1P2                    X 115069237
ENSG00000226023      CT47A6                    X 120958165
ENSG00000213498     TPT1P13                    X 121863698
ENSG00000214930      KRT8P6                    X 138487665
ENSG00000102076      OPN1LW                    X 154144243
ENSG00000230727     RBMY2WP                    Y  22762851