I would like to run ChIPQC on some ENCODE data downloaded using the ENCODExplorer package. Not all the samples have peaks available for download, however I'd still like to see e.g. % of reads in blacklist. I thought that this should be possible with ChIPQC, however I'm getting a (fairly uninformative) error, that persists even if I subset to only the samples that do have peaks.
See samplesheet, traceback, sessionInfo and code to get the data below:
samplesheet <- structure(list(Factor = c("CTCF", "CTCF", "CTCF", "CTCF"),
assay = c("ChIP-seq", "ChIP-seq", "ChIP-seq", "ChIP-seq"), Tissue =
c("CH12.LX", "CH12.LX", "CH12.LX", "CH12.LX"), Replicate = c(1L, 1L, 2L, 2L),
lab = c("Michael Snyder, Stanford", "Ross Hardison, PennState", "Michael
Snyder, Stanford", "Ross Hardison, PennState" ), organism = structure(c(3L,
3L, 3L, 3L), .Label = c("Drosophila melanogaster", "Homo sapiens", "Mus
musculus"), class = "factor"), assembly = c("mm9", "mm9", "mm9", "mm9"),
bamReads = c("ENCFF001NGB.bam", "ENCFF001MLC.bam", "ENCFF001NGJ.bam",
"ENCFF001MLD.bam"), Peaks = c(NA, "ENCFF001MLH.bed", NA, "ENCFF001MLI.bed"),
lab_short = list("Stanford", "PennState", "Stanford", "PennState"), SampleID =
c("CTCF_CH12.LX_Stanford_1", "CTCF_CH12.LX_PennState_1",
"CTCF_CH12.LX_Stanford_2", "CTCF_CH12.LX_PennState_2" )), class =
"data.frame", .Names = c("Factor", "assay", "Tissue", "Replicate", "lab",
"organism", "assembly", "bamReads", "Peaks", "lab_short", "SampleID"),
row.names = c(NA, -4L))
qc_res <- ChIPQC(samplesheet, annotation = "mm9", chromosome = "chr1",
blacklist = mm9_blacklist)
# CTCF_CH12.LX_Stanford_1 CH12.LX CTCF 1 bed
# CTCF_CH12.LX_PennState_1 CH12.LX CTCF 1 bed
# CTCF_CH12.LX_Stanford_2 CH12.LX CTCF 2 bed
# CTCF_CH12.LX_PennState_2 CH12.LX CTCF 2 bed
# Error in if (res >= minval) { : missing value where TRUE/FALSE needed
# only ones with peaks available
qc_res <- ChIPQC(samplesheet[!is.na(samplesheet$Peaks),],
samples=qc_res_list[!is.na(samplesheet$Peaks)])
# CTCF_CH12.LX_Stanford_1 CH12.LX CTCF 1 bed
# CTCF_CH12.LX_PennState_1 CH12.LX CTCF 1 bed
# CTCF_CH12.LX_Stanford_2 CH12.LX CTCF 2 bed
# CTCF_CH12.LX_PennState_2 CH12.LX CTCF 2 bed
#Error in if (res >= minval) { : missing value where TRUE/FALSE needed
# traceback()
# 6: FUN(newX[, i], ...)
# 5: apply(pv$allvectors[, 4:ncol(pv$allvectors)], 1, pv.minOverlap,
# minOverlap)
# 4: pv.vectors(model, mask = mask, minOverlap = minOverlap, bKeepAll = bKeepAll,
# bAnalysis = bAnalysis, attributes = attributes)
# 3: pv.model(DBA, mask = mask, minOverlap = minOverlap, samplesheet = sampleSheet,
# config = config, caller = peakCaller, format = peakFormat,
# scorecol = scoreCol, bLowerBetter = bLowerScoreBetter, skipLines = skipLines,
# bAddCallerConsensus = bAddCallerConsensus, bRemoveM = bRemoveM,
# bRemoveRandom = bRemoveRandom, bKeepAll = TRUE, bAnalysis = TRUE,
# filter = filter, attributes = attributes)
# 2: dba(sampleSheet = experiment, bCorPlot = FALSE, peakCaller = "bed")
# 1: ChIPQC(samplesheet, annotation = "mm9", chromosome = "chr1",
# blacklist = mm9_blacklist)
sessionInfo()
# R version 3.2.3 (2015-12-10)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Debian GNU/Linux stretch/sid
#
# locale:
# [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=en_GB.UTF-8
# [4] LC_COLLATE=en_GB.UTF-8 LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
# [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C LC_ADDRESS=C
# [10] LC_TELEPHONE=C LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats4 parallel stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] TxDb.Mmusculus.UCSC.mm9.knownGene_3.2.2 GenomicFeatures_1.22.13
# [3] AnnotationDbi_1.32.3 dplyr_0.4.3
# [5] tidyr_0.4.0 ChIPQC_1.6.1
# [7] DiffBind_1.16.3 RSQLite_1.0.0
# [9] DBI_0.3.1 locfit_1.5-9.1
# [11] GenomicAlignments_1.6.3 Rsamtools_1.22.0
# [13] Biostrings_2.38.4 XVector_0.10.0
# [15] limma_3.26.8 SummarizedExperiment_1.0.2
# [17] Biobase_2.30.0 GenomicRanges_1.22.4
# [19] GenomeInfoDb_1.6.3 IRanges_2.4.8
# [21] S4Vectors_0.8.11 BiocGenerics_0.16.1
# [23] ggplot2_2.0.0 ENCODExplorer_1.2.1
# [25] BiocInstaller_1.20.1
#
# loaded via a namespace (and not attached):
# [1] edgeR_3.12.0 jsonlite_0.9.19 splines_3.2.3 gtools_3.5.0
# [5] assertthat_0.1 highr_0.5.1 latticeExtra_0.6-28 amap_0.8-14
# [9] RBGL_1.46.0 Category_2.36.0 backports_1.0.0 lattice_0.20-33
# [13] digest_0.6.9 RColorBrewer_1.1-2 checkmate_1.7.2 colorspace_1.2-6
# [17] Matrix_1.2-3 plyr_1.8.3 GSEABase_1.32.0 chipseq_1.20.0
# [21] XML_3.98-1.3 pheatmap_1.0.8 ShortRead_1.28.0 biomaRt_2.26.1
# [25] genefilter_1.52.1 zlibbioc_1.16.0 xtable_1.8-2 GO.db_3.2.2
# [29] scales_0.4.0 brew_1.0-6 gdata_2.17.0 BiocParallel_1.4.3
# [33] annotate_1.48.0 lazyeval_0.1.10 survival_2.38-3 magrittr_1.5
# [37] systemPipeR_1.4.8 fail_1.3 gplots_2.17.0 hwriter_1.3.2
# [41] GOstats_2.36.0 graph_1.48.0 tools_3.2.3 BBmisc_1.9
# [45] stringr_1.0.0 sendmailR_1.2-1 munsell_0.4.3 lambda.r_1.1.7
# [49] caTools_1.17.1 futile.logger_1.4.1 grid_3.2.3 RCurl_1.95-4.7
# [53] rjson_0.2.15 AnnotationForge_1.12.2 bitops_1.0-6 base64enc_0.1-3
# [57] gtable_0.2.0 R6_2.1.2 reshape2_1.4.1 Nozzle.R1_1.1-1
# [61] knitr_1.12.3 rtracklayer_1.30.2 futile.options_1.0.0 KernSmooth_2.23-15
# [65] stringi_1.0-1 BatchJobs_1.6 Rcpp_0.12.3
# Code for downloading data and preparing samplesheet:
library(ENCODExplorer)
library(ChIPQC)
## Get data
biosamples <- c("CH12.LX")
targets <- c("CTCF")
formats <- c("bam", "bigBed")
res <- queryEncode(assay = "ChIP-seq", organism = "Mus musculus", fixed = FALSE,
biosample = paste(biosamples, collapse = "|"),
target = paste(targets, collapse = "|"),
file_format = paste(formats, collapse = "|"))
#pretty table
cols <- c("target","assay", "biosample_name", "biological_replicate_number",
"file_format","lab", "organism", "assembly", "file_accession")
knitr::kable(res$experiment[, cols])
#download data
downloadEncode(resultSet = res,
resultOrigin = "queryEncode", dir = ".")
## Make samplesheet, etc
samplesheet <- res$experiment[, cols] %>%
spread(key = file_format, value = file_accession) %>% #rename for ChIPQC conventions
rename(bamReads = bam, Peaks = bigBed, Tissue = biosample_name,
Factor = target, Replicate = biological_replicate_number) %>%
mutate(bamReads = paste0(bamReads, ".bam"),
Peaks = ifelse(is.na(Peaks), NA, paste0(Peaks, ".bed")),
Factor = gsub("-mouse", "", Factor)) %>%
mutate(lab_short = lapply(lab, function(x) strsplit(x, ", ")[[1]][2])) %>% #just institution
mutate(SampleID = paste(Factor, Tissue, lab_short, Replicate, sep = "_")) #make Sample ID
bed_files <- unique(samplesheet$Peaks[!is.na(samplesheet$Peaks)])
for (bf in bed_files){
bb <- sub(".bed", ".bigBed", bf)
message("converting file: ", bb)
system(paste("bigBedToBed", bb, bf, sep = " "))
}
mm9_blacklist <- makeGRangesFromDataFrame(read.table("mm9-blacklist.bed",
col.names = c("chr", "start", "end")))
