I want to use rtracklayer to import data from a bigwig file within certain ranges, but I get an error when the bigwig file does not contain data for chrY and chrY is present in the seqlevels of the GRanges object I'm using.
I realise that this is HeLa data, so chrY should have no data on it anyway. I also realise that it's a good idea to tell users when the seqlevels of their "which" ranges and the data they're trying to import don't match, in case it's entirely the wrong genome... I don't understand why it's an error to have "which" ranges where the seqlevels include a chromosome which isn't present in the bigwig file, even if none of the ranges are on that chromosome -- should this be a warning instead?
Example:
#bigwig file from ENCODE, #available here: http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeSydhTfbs/ fn <- "wgEncodeSydhTfbsHelas3Rad21IggrabSig.bw" #dput of first ten ranges of my real data test_gr <- new("GRanges" , seqnames = new("Rle" , values = structure(1L, .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY"), class = "factor") , lengths = 10L , elementMetadata = NULL , metadata = list() ) , ranges = new("IRanges" , start = c(5295L, 950781L, 1297502L, 1685479L, 1690824L, 1709330L, 1835632L, 1845935L, 1895474L, 1976700L) , width = c(10000L, 10000L, 10000L, 10000L, 10000L, 10000L, 10000L, 10000L, 10000L, 10000L) , NAMES = NULL , elementType = "integer" , elementMetadata = NULL , metadata = list() ) , strand = new("Rle" , values = structure(3L, .Label = c("+", "-", "*"), class = "factor") , lengths = 10L , elementMetadata = NULL , metadata = list() ) , elementMetadata = new("DataFrame" , rownames = NULL , nrows = 10L , listData = structure(list(length = c(940L, 3071L, 2534L, 1254L, 1003L, 1106L, 1908L, 1798L, 2210L, 2375L), summit = c(226L, 955L, 1706L, 540L, 513L, 689L, 990L, 935L, 1368L, 1196L), tags = c(426L, 220L, 91L, 99L, 34L, 36L, 153L, 250L, 475L, 302L), X.10.log10.pvalue. = c(1115.24, 73.32, 256.5, 72.1, 58.18, 56.09, 60.68, 553.67, 3100, 360.59 ), fold_enrichment = c(7.7, 4.89, 6.59, 4.5, 5.83, 4.89, 4.72, 8.27, 61.16, 9.33), FDR... = c(0, 3.83, 0.96, 3.9, 6.71, 7.41, 6.21, 0, 0, 0)), .Names = c("length", "summit", "tags", "X.10.log10.pvalue.", "fold_enrichment", "FDR...")) , elementType = "ANY" , elementMetadata = NULL , metadata = list() ) , seqinfo = new("Seqinfo" , seqnames = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY") , seqlengths = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_) , is_circular = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA) , genome = c(NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_ ) ) , metadata = list() ) x <- import("wgEncodeSydhTfbsHelas3Rad21IggrabSig.bw", which = test_gr) ## Error in .local(con, format, text, ...) : ## 'which' contains sequence names not known to BigWig file: chrY sessionInfo() R version 3.2.1 (2015-06-18) Platform: x86_64-unknown-linux-gnu (64-bit) Running under: Debian GNU/Linux stretch/sid locale: [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8 [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8 [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] stats4 parallel stats graphics grDevices utils datasets [8] methods base other attached packages: [1] rtracklayer_1.28.5 GenomicRanges_1.20.5 GenomeInfoDb_1.4.1 [4] IRanges_2.2.5 S4Vectors_0.6.1 BiocGenerics_0.14.0 loaded via a namespace (and not attached): [1] XML_3.98-1.2 Rsamtools_1.20.4 Biostrings_2.36.1 [4] bitops_1.0-6 GenomicAlignments_1.4.1 futile.options_1.0.0 [7] zlibbioc_1.14.0 XVector_0.8.0 futile.logger_1.4.1 [10] lambda.r_1.1.7 BiocParallel_1.2.6 tools_3.2.1 [13] RCurl_1.95-4.6
Great, thank you!