Does anyone know whether there is something wrong with the code below, which is meant to extract minor allele data for GRCh38? The data is apparently available on Ensembl (e.g. https://www.ensembl.org/Homo_sapiens/Variation/Explore?db=core;r=10:120308754-120309754;v=rs10788066;vdb=variation;vf=654619804), but when I use host = "https://www.ensembl.org/" the query returns NA for both the minor allele and the minor allele frequency. Is there something else I should do? Is there a better method? Or is the minor allele data from GRCh37 actually up to date? I give three examples. The first, with only two SNPs queried, shows edge effects (which I presume are not so important). The second gives the required output (but from GRCh37). The third gives all NAs for the minor allele data from GRCh38 - I presume the definition of snp.db3 is correct, although I'm still waiting for feedback on that. Alternatively, perhaps this has not been implemented for GRCh38 (which would seem to be of considerable importance)? I also include three other attributes ("chrom_start", "chrom_strand", "associated_gene"), which give correct results for nt.biomart2 and nt.biomart3 (but an edge effect for "associated_gene" with nt.biomart1).
nt.biomart1 - edge effects:
refsnp_id minor_allele minor_allele_freq chrom_start chrom_strand associated_gene
1 rs10788066 TRUE 0.465056 122068766 1 NA
2 rs765840164 NA NA 21830103 1 NA
nt.biomart2 - from grch37 (only first 10 rows are shown here):
refsnp_id minor_allele minor_allele_freq chrom_start chrom_strand associated_gene
1 rs1064213 A 0.379393 198950240 1
2 rs1064213 A 0.379393 198950240 1 PLCL1
3 rs1106090 G 0.472045 58068741 1 EIF2S2P7,VRK2
4 rs1106090 G 0.472045 58068741 1
5 rs12089815 G 0.380391 91189933 1
6 rs12089815 G 0.380391 91189933 1 BARHL2
7 rs12140153 T 0.026558 62579891 1
8 rs12140153 T 0.026558 62579891 1 <NA>
9 rs12140153 T 0.026558 62579891 1 PATJ
10 rs12140153 T 0.026558 62579891 1 INADL
nt.biomart3 - from grch38 (only first 10 rows shown here):
refsnp_id minor_allele minor_allele_freq chrom_start chrom_strand associated_gene
1 rs1064213 NA NA 198085516 1
2 rs1064213 NA NA 198085516 1 PLCL1
3 rs1106090 NA NA 57841606 1 EIF2S2P7,VRK2
4 rs1106090 NA NA 57841606 1
5 rs12089815 NA NA 90724376 1
6 rs12089815 NA NA 90724376 1 BARHL2
7 rs12140153 NA NA 62114219 1 PATJ
8 rs12140153 NA NA 62114219 1
9 rs12140153 NA NA 62114219 1 INADL
10 rs12140153 NA NA 62114219 1 <NA>
library("biomaRt")
## Query definition shared by all three examples: every getBM() call below
## requests the same six attributes and filters on refSNP IDs.
snp_attributes <- c(
  "refsnp_id", "minor_allele", "minor_allele_freq",
  "chrom_start", "chrom_strand", "associated_gene"
)

## refSNP IDs for the two-SNP example and for the ten-SNP examples.
rsids_pair <- c("rs10788066", "rs765840164")
rsids_ten <- c(
  "rs3762444", "rs284262", "rs655598", "rs12089815", "rs12140153",
  "rs788163", "rs1064213", "rs1106090", "rs7557796", "rs16825008"
)

## nt.biomart1 - GRCh37 host, two SNPs only (the "edge effects" example).
## NOTE(review): the minor allele printing as TRUE here is presumably "T"
## being parsed as a logical when it is the only non-NA value - confirm.
snp.db1 <- useMart(
  biomart = "ENSEMBL_MART_SNP",
  dataset = "hsapiens_snp",
  host = "https://grch37.ensembl.org"
)
nt.biomart1 <- getBM(
  attributes = snp_attributes,
  filters = "snp_filter",
  values = rsids_pair,
  mart = snp.db1,
  uniqueRows = TRUE
)
nt.biomart1

## nt.biomart2 - same GRCh37 mart, ten SNPs.
nt.biomart2 <- getBM(
  attributes = snp_attributes,
  filters = "snp_filter",
  values = rsids_ten,
  mart = snp.db1,
  uniqueRows = TRUE
)
nt.biomart2

## nt.biomart3 - identical ten-SNP query against the main Ensembl host
## (GRCh38), so any difference from nt.biomart2 comes from the mart itself.
snp.db3 <- useMart(
  biomart = "ENSEMBL_MART_SNP",
  dataset = "hsapiens_snp",
  host = "https://www.ensembl.org/"
)
nt.biomart3 <- getBM(
  attributes = snp_attributes,
  filters = "snp_filter",
  values = rsids_ten,
  mart = snp.db3,
  uniqueRows = TRUE
)
nt.biomart3

## Record the R and package versions used for the queries above.
sessionInfo()
R version 4.3.3 (2024-02-29)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.4
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Europe/Berlin
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] biomaRt_2.58.2
loaded via a namespace (and not attached):
[1] rappdirs_0.3.3 utf8_1.2.4 generics_0.1.3
[4] bitops_1.0-7 xml2_1.3.6 RSQLite_2.3.6
[7] stringi_1.8.4 hms_1.1.3 digest_0.6.35
[10] magrittr_2.0.3 fastmap_1.2.0 blob_1.2.4
[13] progress_1.2.3 AnnotationDbi_1.64.1 GenomeInfoDb_1.38.8
[16] DBI_1.2.2 httr_1.4.7 purrr_1.0.2
[19] fansi_1.0.6 XML_3.99-0.16.1 Biostrings_2.70.3
[22] cli_3.6.2 rlang_1.1.3 crayon_1.5.2
[25] dbplyr_2.5.0 XVector_0.42.0 Biobase_2.62.0
[28] bit64_4.0.5 withr_3.0.0 cachem_1.0.8
[31] tools_4.3.3 memoise_2.0.1 dplyr_1.1.4
[34] GenomeInfoDbData_1.2.11 filelock_1.0.3 BiocGenerics_0.48.1
[37] curl_5.2.1 vctrs_0.6.5 R6_2.5.1
[40] png_0.1-8 stats4_4.3.3 lifecycle_1.0.4
[43] BiocFileCache_2.10.2 zlibbioc_1.48.2 KEGGREST_1.42.0
[46] stringr_1.5.1 S4Vectors_0.40.2 IRanges_2.36.0
[49] bit_4.0.5 pkgconfig_2.0.3 pillar_1.9.0
[52] glue_1.7.0 tibble_3.2.1 tidyselect_1.2.1
[55] compiler_4.3.3 prettyunits_1.2.0 RCurl_1.98-1.14