I want to use rtracklayer to import data from a bigwig file within certain ranges, but I get an error when the bigwig file does not contain data for chrY and chrY is present in the seqlevels of the GRanges object I'm using.
I realise that this is HeLa data, so chrY should have no data on it anyway. I also realise that it's a good idea to tell users when the seqlevels of their "which" ranges and the data they're trying to import don't match, in case it's entirely the wrong genome... I don't understand why it's an error to have "which" ranges where the seqlevels include a chromosome which isn't present in the bigwig file, even if none of the ranges are on that chromosome -- should this be a warning instead?
Example:
#bigwig file from ENCODE,
#available here: http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeSydhTfbs/
fn <- "wgEncodeSydhTfbsHelas3Rad21IggrabSig.bw"
#dput of first ten ranges of my real data
test_gr <- new("GRanges"
, seqnames = new("Rle"
, values = structure(1L, .Label = c("chr1", "chr2", "chr3", "chr4", "chr5",
"chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13",
"chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20",
"chr21", "chr22", "chrX", "chrY"), class = "factor")
, lengths = 10L
, elementMetadata = NULL
, metadata = list()
)
, ranges = new("IRanges"
, start = c(5295L, 950781L, 1297502L, 1685479L, 1690824L, 1709330L, 1835632L,
1845935L, 1895474L, 1976700L)
, width = c(10000L, 10000L, 10000L, 10000L, 10000L, 10000L, 10000L, 10000L,
10000L, 10000L)
, NAMES = NULL
, elementType = "integer"
, elementMetadata = NULL
, metadata = list()
)
, strand = new("Rle"
, values = structure(3L, .Label = c("+", "-", "*"), class = "factor")
, lengths = 10L
, elementMetadata = NULL
, metadata = list()
)
, elementMetadata = new("DataFrame"
, rownames = NULL
, nrows = 10L
, listData = structure(list(length = c(940L, 3071L, 2534L, 1254L, 1003L, 1106L,
1908L, 1798L, 2210L, 2375L), summit = c(226L, 955L, 1706L, 540L,
513L, 689L, 990L, 935L, 1368L, 1196L), tags = c(426L, 220L, 91L,
99L, 34L, 36L, 153L, 250L, 475L, 302L), X.10.log10.pvalue. = c(1115.24,
73.32, 256.5, 72.1, 58.18, 56.09, 60.68, 553.67, 3100, 360.59
), fold_enrichment = c(7.7, 4.89, 6.59, 4.5, 5.83, 4.89, 4.72,
8.27, 61.16, 9.33), FDR... = c(0, 3.83, 0.96, 3.9, 6.71, 7.41,
6.21, 0, 0, 0)), .Names = c("length", "summit", "tags", "X.10.log10.pvalue.",
"fold_enrichment", "FDR..."))
, elementType = "ANY"
, elementMetadata = NULL
, metadata = list()
)
, seqinfo = new("Seqinfo"
, seqnames = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8",
"chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15",
"chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22",
"chrX", "chrY")
, seqlengths = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)
, is_circular = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA)
, genome = c(NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_
)
)
, metadata = list()
)
x <- import("wgEncodeSydhTfbsHelas3Rad21IggrabSig.bw", which = test_gr)
## Error in .local(con, format, text, ...) :
## 'which' contains sequence names not known to BigWig file: chrY
sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-unknown-linux-gnu (64-bit)
Running under: Debian GNU/Linux stretch/sid
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 parallel stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] rtracklayer_1.28.5 GenomicRanges_1.20.5 GenomeInfoDb_1.4.1
[4] IRanges_2.2.5 S4Vectors_0.6.1 BiocGenerics_0.14.0
loaded via a namespace (and not attached):
[1] XML_3.98-1.2 Rsamtools_1.20.4 Biostrings_2.36.1
[4] bitops_1.0-6 GenomicAlignments_1.4.1 futile.options_1.0.0
[7] zlibbioc_1.14.0 XVector_0.8.0 futile.logger_1.4.1
[10] lambda.r_1.1.7 BiocParallel_1.2.6 tools_3.2.1
[13] RCurl_1.95-4.6

Great, thank you!