I would like to run ChIPQC on some ENCODE data downloaded using the ENCODExplorer package. Not all the samples have peaks available for download, however I'd still like to see e.g. % of reads in blacklist. I thought that this should be possible with ChIPQC, however I'm getting a (fairly uninformative) error, that persists even if I subset to only the samples that do have peaks.
See samplesheet, traceback, sessionInfo and code to get the data below:
samplesheet <- structure(list(Factor = c("CTCF", "CTCF", "CTCF", "CTCF"), assay = c("ChIP-seq", "ChIP-seq", "ChIP-seq", "ChIP-seq"), Tissue = c("CH12.LX", "CH12.LX", "CH12.LX", "CH12.LX"), Replicate = c(1L, 1L, 2L, 2L), lab = c("Michael Snyder, Stanford", "Ross Hardison, PennState", "Michael Snyder, Stanford", "Ross Hardison, PennState" ), organism = structure(c(3L, 3L, 3L, 3L), .Label = c("Drosophila melanogaster", "Homo sapiens", "Mus musculus"), class = "factor"), assembly = c("mm9", "mm9", "mm9", "mm9"), bamReads = c("ENCFF001NGB.bam", "ENCFF001MLC.bam", "ENCFF001NGJ.bam", "ENCFF001MLD.bam"), Peaks = c(NA, "ENCFF001MLH.bed", NA, "ENCFF001MLI.bed"), lab_short = list("Stanford", "PennState", "Stanford", "PennState"), SampleID = c("CTCF_CH12.LX_Stanford_1", "CTCF_CH12.LX_PennState_1", "CTCF_CH12.LX_Stanford_2", "CTCF_CH12.LX_PennState_2" )), class = "data.frame", .Names = c("Factor", "assay", "Tissue", "Replicate", "lab", "organism", "assembly", "bamReads", "Peaks", "lab_short", "SampleID"), row.names = c(NA, -4L)) qc_res <- ChIPQC(samplesheet, annotation = "mm9", chromosome = "chr1", blacklist = mm9_blacklist) # CTCF_CH12.LX_Stanford_1 CH12.LX CTCF 1 bed # CTCF_CH12.LX_PennState_1 CH12.LX CTCF 1 bed # CTCF_CH12.LX_Stanford_2 CH12.LX CTCF 2 bed # CTCF_CH12.LX_PennState_2 CH12.LX CTCF 2 bed # Error in if (res >= minval) { : missing value where TRUE/FALSE needed # only ones with peaks available qc_res <- ChIPQC(samplesheet[!is.na(samplesheet$Peaks),], samples=qc_res_list[!is.na(samplesheet$Peaks)]) # CTCF_CH12.LX_Stanford_1 CH12.LX CTCF 1 bed # CTCF_CH12.LX_PennState_1 CH12.LX CTCF 1 bed # CTCF_CH12.LX_Stanford_2 CH12.LX CTCF 2 bed # CTCF_CH12.LX_PennState_2 CH12.LX CTCF 2 bed #Error in if (res >= minval) { : missing value where TRUE/FALSE needed # traceback() # 6: FUN(newX[, i], ...) # 5: apply(pv$allvectors[, 4:ncol(pv$allvectors)], 1, pv.minOverlap, # minOverlap) # 4: pv.vectors(model, mask = mask, minOverlap = minOverlap, bKeepAll = bKeepAll, # bAnalysis = bAnalysis, attributes = attributes) # 3: pv.model(DBA, mask = mask, minOverlap = minOverlap, samplesheet = sampleSheet, # config = config, caller = peakCaller, format = peakFormat, # scorecol = scoreCol, bLowerBetter = bLowerScoreBetter, skipLines = skipLines, # bAddCallerConsensus = bAddCallerConsensus, bRemoveM = bRemoveM, # bRemoveRandom = bRemoveRandom, bKeepAll = TRUE, bAnalysis = TRUE, # filter = filter, attributes = attributes) # 2: dba(sampleSheet = experiment, bCorPlot = FALSE, peakCaller = "bed") # 1: ChIPQC(samplesheet, annotation = "mm9", chromosome = "chr1", # blacklist = mm9_blacklist) sessionInfo() # R version 3.2.3 (2015-12-10) # Platform: x86_64-pc-linux-gnu (64-bit) # Running under: Debian GNU/Linux stretch/sid # # locale: # [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=en_GB.UTF-8 # [4] LC_COLLATE=en_GB.UTF-8 LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8 # [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C LC_ADDRESS=C # [10] LC_TELEPHONE=C LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C # # attached base packages: # [1] stats4 parallel stats graphics grDevices utils datasets methods base # # other attached packages: # [1] TxDb.Mmusculus.UCSC.mm9.knownGene_3.2.2 GenomicFeatures_1.22.13 # [3] AnnotationDbi_1.32.3 dplyr_0.4.3 # [5] tidyr_0.4.0 ChIPQC_1.6.1 # [7] DiffBind_1.16.3 RSQLite_1.0.0 # [9] DBI_0.3.1 locfit_1.5-9.1 # [11] GenomicAlignments_1.6.3 Rsamtools_1.22.0 # [13] Biostrings_2.38.4 XVector_0.10.0 # [15] limma_3.26.8 SummarizedExperiment_1.0.2 # [17] Biobase_2.30.0 GenomicRanges_1.22.4 # [19] GenomeInfoDb_1.6.3 IRanges_2.4.8 # [21] S4Vectors_0.8.11 BiocGenerics_0.16.1 # [23] ggplot2_2.0.0 ENCODExplorer_1.2.1 # [25] BiocInstaller_1.20.1 # # loaded via a namespace (and not attached): # [1] edgeR_3.12.0 jsonlite_0.9.19 splines_3.2.3 gtools_3.5.0 # [5] assertthat_0.1 highr_0.5.1 latticeExtra_0.6-28 amap_0.8-14 # [9] RBGL_1.46.0 Category_2.36.0 backports_1.0.0 lattice_0.20-33 # [13] digest_0.6.9 RColorBrewer_1.1-2 checkmate_1.7.2 colorspace_1.2-6 # [17] Matrix_1.2-3 plyr_1.8.3 GSEABase_1.32.0 chipseq_1.20.0 # [21] XML_3.98-1.3 pheatmap_1.0.8 ShortRead_1.28.0 biomaRt_2.26.1 # [25] genefilter_1.52.1 zlibbioc_1.16.0 xtable_1.8-2 GO.db_3.2.2 # [29] scales_0.4.0 brew_1.0-6 gdata_2.17.0 BiocParallel_1.4.3 # [33] annotate_1.48.0 lazyeval_0.1.10 survival_2.38-3 magrittr_1.5 # [37] systemPipeR_1.4.8 fail_1.3 gplots_2.17.0 hwriter_1.3.2 # [41] GOstats_2.36.0 graph_1.48.0 tools_3.2.3 BBmisc_1.9 # [45] stringr_1.0.0 sendmailR_1.2-1 munsell_0.4.3 lambda.r_1.1.7 # [49] caTools_1.17.1 futile.logger_1.4.1 grid_3.2.3 RCurl_1.95-4.7 # [53] rjson_0.2.15 AnnotationForge_1.12.2 bitops_1.0-6 base64enc_0.1-3 # [57] gtable_0.2.0 R6_2.1.2 reshape2_1.4.1 Nozzle.R1_1.1-1 # [61] knitr_1.12.3 rtracklayer_1.30.2 futile.options_1.0.0 KernSmooth_2.23-15 # [65] stringi_1.0-1 BatchJobs_1.6 Rcpp_0.12.3 # Code for downloading data and preparing samplesheet: library(ENCODExplorer) library(ChIPQC) ## Get data biosamples <- c("CH12.LX") targets <- c("CTCF") formats <- c("bam", "bigBed") res <- queryEncode(assay = "ChIP-seq", organism = "Mus musculus", fixed = FALSE, biosample = paste(biosamples, collapse = "|"), target = paste(targets, collapse = "|"), file_format = paste(formats, collapse = "|")) #pretty table cols <- c("target","assay", "biosample_name", "biological_replicate_number", "file_format","lab", "organism", "assembly", "file_accession") knitr::kable(res$experiment[, cols]) #download data downloadEncode(resultSet = res, resultOrigin = "queryEncode", dir = ".") ## Make samplesheet, etc samplesheet <- res$experiment[, cols] %>% spread(key = file_format, value = file_accession) %>% #rename for ChIPQC conventions rename(bamReads = bam, Peaks = bigBed, Tissue = biosample_name, Factor = target, Replicate = biological_replicate_number) %>% mutate(bamReads = paste0(bamReads, ".bam"), Peaks = ifelse(is.na(Peaks), NA, paste0(Peaks, ".bed")), Factor = gsub("-mouse", "", Factor)) %>% mutate(lab_short = lapply(lab, function(x) strsplit(x, ", ")[[1]][2])) %>% #just institution mutate(SampleID = paste(Factor, Tissue, lab_short, Replicate, sep = "_")) #make Sample ID bed_files <- unique(samplesheet$Peaks[!is.na(samplesheet$Peaks)]) for (bf in bed_files){ bb <- sub(".bed", ".bigBed", bf) message("converting file: ", bb) system(paste("bigBedToBed", bb, bf, sep = " ")) } mm9_blacklist <- makeGRangesFromDataFrame(read.table("mm9-blacklist.bed", col.names = c("chr", "start", "end")))