Question

Log of Step 1

0

Entering edit mode

Anushka • 0

@4a6b0437

Last seen 2.9 years ago

United States

The log function seems to not be working correctly when ramwas1scanBams(param) is called. This code snippet might be a good way to correct that.

# ## ----scan-bams, warning=FALSE, message=FALSE----------------------------------
# # Scan a BAM file into Rbam object
# bam.scanBamFile = function(bamfilename, scoretag = "MAPQ", minscore = 4){
#   fields = c("qname", "rname", "pos", "cigar", "flag");
#   # "qname" is read name, "rname" is chromosome
#   tags = "NM";# character();
#   
#   # Open the BAM file
#   {
#     if(nchar(scoretag) == 2){
#       scoretag = toupper(scoretag);
#       tags = c(tags, scoretag);
#     } else {
#       scoretag = tolower(scoretag);
#       fields = c(fields,scoretag);
#     }
#     
#     flag = scanBamFlag(isUnmappedQuery=NA, isSecondMateRead=FALSE);
#     param = ScanBamParam(flag=flag, what=fields, tag=tags);
#     bf = BamFile(bamfilename, yieldSize=1e6) ## typically, yieldSize=1e6
#     open(bf);
#     rm(fields, tags, flag);
#   } # bf, param
#   
#   qc = list(nbams = 1L);
#   
#   startlistfwd = NULL;
#   repeat{
#     ### Read "yieldSize" rows
#     bb = scanBam(file=bf, param=param)[[1]];
#     if( length(bb[[1]])==0 )
#       break;
#     
#     ### Put tags in the main list
#     bb = c(bb[names(bb) != "tag"], bb$tag);
#     
#     # stopifnot( length(bb[[scoretag]]) == length(bb[[1]]) )
#     
#     ### Create output lists
#     if(is.null(startlistfwd)){
#       startlistfwd = vector("list",length(levels(bb$rname)));
#       startlistrev = vector("list",length(levels(bb$rname)));
#       names(startlistfwd) = levels(bb$rname);
#       names(startlistrev) = levels(bb$rname);
#       startlistfwd[] = list(list())
#       startlistrev[] = list(list())
#     } # startlistfwd, startlistrev
#     
#     ### Keep only primary reads
#     {
#       keep = bitwAnd(bb$flag, 256L) == 0L;
#       if(!all(keep))
#         bb = lapply(bb,`[`,which(keep));
#       rm(keep);
#     }
#     
#     qc$reads.total = qc$reads.total %add% length(bb[[1]]);
#     
#     ### Keep only aligned reads
#     {
#       keep = bitwAnd(bb$flag, 4L) == 0L;
#       if(!all(keep))
#         bb = lapply(bb,`[`,which(keep));
#       rm(keep);
#     }
#     
#     if(length(bb[[1]])==0){
#       message(sprintf("Recorded %.f of %.f reads",
#                       qc$reads.recorded,qc$reads.total));
#       next;
#     }
#     
#     bb$matchedAlongQuerySpace =
#       cigarWidthAlongQuerySpace(bb$cigar,after.soft.clipping = TRUE);
#     
#     qc$reads.aligned =
#       qc$reads.aligned %add% length(bb[[1]]);
#     qc$bf.hist.score1 =
#       qc$bf.hist.score1 %add% tabulate(pmax(bb[[scoretag]]+1L,1L));
#     qc$bf.hist.edit.dist1 =
#       qc$bf.hist.edit.dist1 %add% tabulate(bb$NM+1L);
#     qc$bf.hist.length.matched =
#       qc$bf.hist.length.matched %add% tabulate(bb$matchedAlongQuerySpace);
#     
#     ### Keep score >= minscore
#     if( !is.null(minscore) ){
#       score = bb[[scoretag]];
#       keep = score >= minscore;
#       keep[is.na(keep)] = FALSE;
#       if(!all(keep))
#         bb = lapply(bb,`[`,which(keep));
#       rm(keep);
#     }
#     
#     qc$reads.recorded =
#       qc$reads.recorded %add% length(bb[[1]]);
#     qc$hist.score1 =
#       qc$hist.score1 %add% tabulate(pmax(bb[[scoretag]]+1L,1L));
#     qc$hist.edit.dist1 =
#       qc$hist.edit.dist1 %add% tabulate(bb$NM+1L);
#     qc$hist.length.matched =
#       qc$hist.length.matched %add% tabulate(bb$matchedAlongQuerySpace);
#     
#     ### Forward vs. Reverse strand
#     bb$isReverse = bitwAnd(bb$flag, 0x10) > 0;
#     qc$frwrev = qc$frwrev %add% tabulate(bb$isReverse + 1L)
#     
#     
#     ### Read start positions (accounting for direction)
#     {
#       bb$startpos = bb$pos;
#       bb$startpos[bb$isReverse] = bb$startpos[bb$isReverse] +
#         (cigarWidthAlongReferenceSpace(bb$cigar[bb$isReverse])-1L) - 1L;
#       # Last -1L is for shift from C on reverse strand to C on the forward
#     } # startpos
#     
#     ### Split and store the start locations
#     {
#       offset = length(startlistfwd);
#       split.levels = as.integer(bb$rname) + offset*bb$isReverse;
#       levels(split.levels) = c(
#         names(startlistfwd),
#         paste0(names(startlistfwd),"-"));
#       class(split.levels) = "factor";
#       splt = split( bb$startpos, split.levels, drop = FALSE);
#       # print(vapply(splt,length,0))
#       for( i in seq_along(startlistfwd) ){
#         if( length(splt[i]) > 0 ){
#           startlistfwd[[i]][[length(startlistfwd[[i]])+1L]] =
#             splt[[i]];
#         }
#         if( length(splt[i+offset]) > 0 ){
#           startlistrev[[i]][[length(startlistrev[[i]])+1L]] =
#             splt[[i+offset]];
#         }
#       }
#       rm(offset, split.levels, splt);
#     } # startlistfwd, startlistrev
#     message(sprintf("Recorded %.f of %.f reads",
#                     qc$reads.recorded,qc$reads.total));
#   }
#   close(bf);
#   rm(bf); # , oldtail
#   
#   startsfwd = startlistfwd;
#   startsrev = startlistrev;
#   
#   ### combine and sort lists in "outlist"
#   for( i in seq_along(startlistfwd) ){
#     startsfwd[[i]] = sort.int(unlist(startlistfwd[[i]]));
#     startsrev[[i]] = sort.int(unlist(startlistrev[[i]]));
#   }
#   gc();
#   
#   if( !is.null(qc$hist.score1))
#     class(qc$hist.score1) = "qcHistScore";
#   if( !is.null(qc$bf.hist.score1))
#     class(qc$bf.hist.score1) = "qcHistScoreBF";
#   if( !is.null(qc$hist.edit.dist1))
#     class(qc$hist.edit.dist1) = "qcEditDist";
#   if( !is.null(qc$bf.hist.edit.dist1))
#     class(qc$bf.hist.edit.dist1) = "qcEditDistBF";
#   if( !is.null(qc$hist.length.matched))
#     class(qc$hist.length.matched) = "qcLengthMatched";
#   if( !is.null(qc$bf.hist.length.matched))
#     class(qc$bf.hist.length.matched) = "qcLengthMatchedBF";
#   if( !is.null(qc$frwrev) )
#     class(qc$frwrev) = "qcFrwrev";
#   
#   info = list(bamname = bamfilename,
#               scoretag = scoretag,
#               minscore = minscore,
#               filesize = file.size(bamfilename));
#   rbam = list(startsfwd = startsfwd,
#               startsrev = startsrev,
#               qc = qc,
#               info = info);
#   return( rbam );
# }
# 
# # Make QC plots for an Rbam
# pipelineSaveQCplots = function(param, rbam, bamname){
#   filename = sprintf("%s/score/hs_%s.pdf", param$dirqc, bamname);
#   dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE);
#   pdf(filename);
#   plot(rbam$qc$hist.score1, samplename = bamname);
#   plot(rbam$qc$bf.hist.score1, samplename = bamname);
#   dev.off();
#   rm(filename);
#   
#   filename = sprintf("%s/edit_distance/ed_%s.pdf", param$dirqc, bamname);
#   dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE);
#   pdf(filename);
#   plot(rbam$qc$hist.edit.dist1, samplename = bamname);
#   plot(rbam$qc$bf.hist.edit.dist1, samplename = bamname);
#   dev.off();
#   rm(filename);
#   
#   filename = sprintf("%s/matched_length/ml_%s.pdf", param$dirqc, bamname);
#   dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE);
#   pdf(filename);
#   plot(rbam$qc$hist.length.matched, samplename = bamname);
#   plot(rbam$qc$bf.hist.length.matched, samplename = bamname);
#   dev.off();
#   rm(filename);
#   
#   if( !is.null(rbam$qc$hist.isolated.dist1) ){
#     filename = sprintf("%s/isolated_distance/id_%s.pdf",
#                        param$dirqc, bamname);
#     dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE);
#     pdf(filename);
#     plot(rbam$qc$hist.isolated.dist1, samplename = bamname);
#     dev.off();
#     rm(filename);
#   }
#   if( !is.null(rbam$qc$avg.coverage.by.density) ){
#     filename = sprintf("%s/isolated_distance/cbd_%s.pdf",
#                        param$dirqc, bamname);
#     dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE);
#     pdf(filename);
#     plot(rbam$qc$avg.coverage.by.density, samplename = bamname);
#     dev.off();
#     rm(filename);
#   }
# }
# 
# # Scan a BAM, calculate QCs, generate plots
# pipelineProcessBam = function(bamname, param){
#   # Used parameters: scoretag, minscore, filecpgset, maxrepeats
#   
#   param = parameterPreprocess(param);
#   
#   if( !is.null(param$filecpgset) && is.null(param$maxfragmentsize) )
#     return("Parameter not set: maxfragmentsize");
#   
#   bamname = gsub("\\.bam$", "", bamname, ignore.case = TRUE);
#   bamfullname = makefullpath(param$dirbam, paste0(bamname,".bam"))
#   
#   dir.create(param$dirrbam, showWarnings = FALSE, recursive = TRUE)
#   dir.create(param$dirrqc,  showWarnings = FALSE, recursive = TRUE)
#   
#   rdsbmfile = paste0(param$dirrbam, "/", basename(bamname), ".rbam.rds");
#   rdsqcfile = paste0(param$dirrqc,  "/", basename(bamname), ".qc.rds");
#   
#   savebam = TRUE;
#   rbam = NULL;
#   if( file.exists(rdsqcfile) & file.exists(rdsbmfile) ){
#     if( param$recalculate.QCs ){
#       ### Precache the input rds file
#       rbam = readRDS(rdsbmfile);
#       savebam = FALSE;
#     } else {
#       if( !file.exists( bamfullname ) )
#         return(paste0("Rbam rds file already exists",
#                       " (no bam): ", rdsqcfile));
#       if( file.mtime(rdsbmfile) > file.mtime(bamfullname) )
#         return(paste0("Rbam rds file already exists",
#                       " (newer than bam): ", rdsqcfile));
#     }
#   }
#   if( is.null(rbam) ){
#     if( !file.exists( bamfullname ) )
#       return(paste0("Bam file does not exist: ", bamfullname));
#     rbam = bam.scanBamFile(
#       bamfilename = bamfullname,
#       scoretag = param$scoretag,
#       minscore = param$minscore);
#   }
#   
#   rbam2 = bam.removeRepeats(rbam, param$maxrepeats);
#   rbam2 = bam.chrXY.qc(rbam2);
#   rbam2$qc$nbams = 1L;
#   
#   if( !is.null(param$filecpgset) ){
#     cpgset = cachedRDSload(param$filecpgset);
#     noncpgset = cachedRDSload(param$filenoncpgset);
#     isocpgset = isocpgSitesFromCpGset(
#       cpgset = cpgset,
#       distance = param$maxfragmentsize);
#     rbam3 = bam.hist.isolated.distances(
#       rbam = rbam2,
#       isocpgset = isocpgset,
#       distance = param$maxfragmentsize);
#     rbam4 = bam.coverage.by.density(
#       rbam = rbam3,
#       cpgset = cpgset,
#       noncpgset = noncpgset,
#       minfragmentsize = param$minfragmentsize,
#       maxfragmentsize = param$maxfragmentsize)
#     rbam5 = bam.count.nonCpG.reads(
#       rbam = rbam4,
#       cpgset = cpgset,
#       distance = param$maxfragmentsize);
#     
#     ### QC plots
#     pipelineSaveQCplots(param, rbam5, basename(bamname));
#     
#   } else {
#     rbam5 = rbam2;
#   }
#   # .qc qcmean(rbam5$qc$chrX.count)
#   # rbam5$qc$chrX.count[1]/rbam5$qc$chrX.count[2]
#   # message(.qcTextLine(rbam5$qc, bamname))
#   
#   if(savebam)
#     saveRDS( object = rbam5, file = rdsbmfile, compress = "xz");
#   rbam6 = rbam5;
#   rbam6$startsfwd = NULL;
#   rbam6$startsrev = NULL;
#   saveRDS( object = rbam6, file = rdsqcfile, compress = "xz");
#   
#   return(NULL);
# }
# 
# # Parallel job function
# .ramwas1scanBamJob = function(bamname, param){
#   ld = param$dirfilter;
#   
#   .log(ld, "%s, Process %06d, Starting BAM: %s",
#        date(), Sys.getpid(), bamname);
#   
#   rez = pipelineProcessBam(bamname = bamname, param = param);
#   
#   .log(ld, "%s, Process %06d, Finished BAM: %s",
#        date(), Sys.getpid(), bamname);
#   return(rez);
# }
# 
# # Step 1 of the pipeline
# ramwas1scanBams = function( param ){
#   param = parameterPreprocess(param);
#   ld = param$dirfilter;
#   
#   # Parameter checks
#   if( is.null(param$bamnames) )
#     stop("BAM names must be specified. ",
#          "See \"filebamlist\" or \"bamnames\" parameter.");
#   
#   if( !dir.exists(param$dirbam) )
#     stop("Directory with BAM files not found: ",
#          param$dirbam, "\n",
#          "See \"dirbam\" parameter");
#   
#   for( nm in param$bamname ){ # nm = param$bamname[1]
#     fn = makefullpath(param$dirbam, paste0(nm, ".bam"));
#     if( !file.exists(fn) )
#       stop("BAM file not found: ", fn);
#   }
#   
#   if( !is.null(param$filecpgset) && is.null(param$maxfragmentsize) )
#     stop("Parameter not set: maxfragmentsize");
#   
#   dir.create(param$dirfilter, showWarnings = FALSE, recursive = TRUE);
#   
#   #   .log(ld, "%s, Start ramwas1scanBams() call", date(), append = FALSE);
#   
#   nthreads = min(param$cputhreads, length(param$bamname));
#   if( nthreads > 1 ){
#     cl = makeCluster(nthreads);
#     on.exit({stopCluster(cl);});
#     logfun = .logErrors(ld, .ramwas1scanBamJob);
#     z = clusterApplyLB(
#       cl = cl,
#       x = param$bamnames,
#       fun = logfun,
#       param = param);
#     tmp = sys.on.exit();
#     eval(tmp);
#     rm(tmp);
#     on.exit();
#   } else {
#     z = vector("list", length(param$bamnames));
#     names(z) = param$bamnames;
#     for( i in seq_along(param$bamnames) ){ # i=1
#       z[[i]] = .ramwas1scanBamJob(
#         bamname = param$bamnames[i],
#         param = param);
#     }
#   }
#   .showErrors(z);
#   .log(ld, "%s, Done ramwas1scanBams() call", date());
#   return(invisible(z));
# }

sessionInfo( )

log timer • 764 views

ADD COMMENT • link updated 3.0 years ago by ATpoint ★ 5.0k • written 3.0 years ago by Anushka • 0

0

Entering edit mode

Please see my comment: maxfragmentsize problem

ADD REPLY • link 3.0 years ago ATpoint ★ 5.0k