Entering edit mode
Daniel.Berner@unibas.ch
▴
90
@danielbernerunibasch-4268
Last seen 3.9 years ago
Martin, Steve
Thanks for your suggestions, much appreciated! My core problem is that
the
operations inside the loop are too complex for me to allow
implementing any
function() and apply() combination (there are multiple interleaved
if() lines
etc). and I was not aware of the data.table package, but this also
looks
hard to me, given the many steps performed within the loop.
to be more specific, I want to reduce a number of Illumina short reads
at
genomic loci to diploid or haploid (depending on coverage) consensus
genotypes, by evaluating several test criteria. Below I am copying the
full
code. the input is an alignment in BAM format.
Any suggestion to make this faster is most welcome, as the code takes
more
than a week to run for each individual, and I have to process 96 of
them (I
have multicore, but still)! It seems to me that the rate-limiting step
is the
subsetting; the other manipulations are fast enough to work in such a
primitive loop.
cheers,
Daniel Berner
Zoological Institute
University of Basel
#########################
library(Rsamtools)
rm(list=ls())
M<-"GAAGC" ### specify MID (individual)
lib<-25 ### specify library
# genotyping settings:
foldcov<-3 # a RAD locus is not gennotyped if its total seq coverage
is more than foldcov times the overall mean coverage
(=length(posi)/length(u.chrpos)). eliminates repeats
prop.1.2<-0.7 # a RAD locus is not genotyped if the proportion of the
two dominant haplotypes together is <= prop.1.2, relative to the total
coverage of the locus. eliminates repeats
sum.1.2<-15 # a RAD locus in not genotyped as DIPLOID if the sum of
the coverage of the two dominant haplotypes (or simply the coverage
for monomorphic loci) is less than sum.1.2
het<-1/3 # a diploid locus is called heterozygous if the ratio
count.htp.2nd/count.htp.dominant is greater than het
min<-2 # a haploid locus is called when coverage is at least min (but
lower than sum.1.2)
# infile upload:
#folder<-'/Users/dani/Documents/genome scan/novocraft' #here are the
BAMs
folder<-'C:\\Documents and Settings\\daniel\\Desktop\\temp' #here are
the BAMs
infile<-paste(folder, "/novAl_lib_", lib,"_", M, ".bam", sep="") #
infile path
outfile<-paste(folder, '/', 'lib_', lib, '_', M, ".consensus.txt",
sep="") # outfile path
#outfile<-paste("/Users/dani/Documents/genome\ scan/R_bioconductor/",
'lib_', lib, '/', 'lib_', lib, '_', M, ".consensus.txt",sep="")
param
<-ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE),what=c("rname",
"pos", "seq", "strand"), tag="Z3", reverseComplement=TRUE)
f<-scanBam(infile, param=param)[[1]] #upload for a c. 1GB BAM is c. 1
minute
# extract the relevant data:
chr<-c(as.character(f$rname[grep("-",f$strand,invert=T)]),as.character
(f$rname[grep("-",f$strand,invert=F)]))
posi<-c(f$pos[grep("-",f$strand,invert=T)],f$tag[[1]][grep("-",f$stran
d,invert=F)])
seqs<-as.character(c(narrow(f$seq,start=6,end=94)[grep("-",f$strand,in
vert=T)],narrow(f$seq,start=6,end=94)[grep("-",f$strand,invert=F)]))
chrpos<-paste(chr, posi, sep=' ')
u.chrpos<-sort(unique(chrpos))
length(posi) #total number of alignments
length(u.chrpos) #this gives the number of unique loci
length(posi)/length(u.chrpos) #mean coverage per locus
c.thr<-as.integer(length(posi)/length(u.chrpos)*foldcov) # relative
coverage threshold based on average coverage
# derive the data object to process:
dat<-data.frame(cbind(chrpos, seqs))
# clean the environment and release memory:
rm(f, chr, posi, seqs, chrpos) #no longer used
gc()
# process the dat object while writing to 'cons'
# outfile data structure: chrpos / coverage / X-Y (needed for SNP
calling) / diploid_homoz(N)-OR-diploid_heteroz(Y)-OR-haploid_low(L) /
seq
cons<-NULL
for (i in 1:length(u.chrpos)){
stack<-DNAStringSet(as.character(subset(dat,
chrpos==u.chrpos[i])[,2]))
stack.le<-length(stack)
if(stack.le<=c.thr){
#monomorphic:
if (length(unique(stack))==1){
#well covered:
if(stack.le>=sum.1.2){
gtp.X<-paste(u.chrpos[i], stack.le, "X", "N",
as.character(stack[1]), sep=" ")
gtp.Y<-paste(u.chrpos[i], stack.le, "Y", "N",
as.character(stack[1]), sep=" ")
cons<-rbind(cons, gtp.X, gtp.Y)
}
#poorly covered:
if((stack.le<sum.1.2) &&="" (stack.le="">=min)){
gtp.X<-paste(u.chrpos[i], stack.le, "X", "L",
as.character(stack[1]), sep=" ")
cons<-rbind(cons, gtp.X)
}
}
#polymorphic:
if(length(unique(stack))>1){
cnts<-sort(table(as.character(stack)), decreasing=T)[1:2]
#no repeat issue:
if((sum(cnts)/stack.le)>prop.1.2){
#heterozygote:
if((sum(cnts)>=sum.1.2) && ((cnts[2]/cnts[1])>het)){
gtp.X<-paste(u.chrpos[i], stack.le, "X", "Y",
as.character(names(cnts)[1]), sep=" ")
gtp.Y<-paste(u.chrpos[i], stack.le, "Y", "Y",
as.character(names(cnts)[2]), sep=" ")
cons<-rbind(cons, gtp.X, gtp.Y)
}
#homozygote (minor variant too rare, artifact):
if((sum(cnts)>=sum.1.2) && ((cnts[2]/cnts[1])<=het)){
# minor variant too rare, artifact; hence locus diploid homozygote
gtp.X<-paste(u.chrpos[i], stack.le, "X", "N",
as.character(names(cnts)[1]), sep=" ")
gtp.Y<-paste(u.chrpos[i], stack.le, "Y", "N",
as.character(names(cnts)[1]), sep=" ")
cons<-rbind(cons, gtp.X, gtp.Y)
}
#coverage poor, gtp unclear, haploid call:
if((sum(cnts)<sum.1.2) &&="" (cnts[1]="">=min)){
gtp.X<-paste(u.chrpos[i], stack.le, "X", "L",
as.character(names(cnts)[1]), sep=" ")
cons<-rbind(cons, gtp.X)
}
}
}
}
}
write.table(cons, outfile, row.names=F, col.names=F, quote=F) #write
to individual file
### END
[[alternative HTML version deleted]]