Hello all, I have read 2-3 times this : https://bioconductor.riken.jp/packages/3.0/bioc/vignettes/csaw/inst/doc/csawUserGuide.pdf , csaw user guide. But, my knowledge in R is very limited. I have 6 samples, (0h 2 replicates, 3h 2 replicates, 6h 2 replicates) , i have done alignment with bwa-mem, converted sam to bam, sorted, removed duplicates, kept only uniquely mapped, merged the 2 replicates into 1 bam file ( as the guide says ) and indexed every bam file. Is there any script which i can find somewhere ready-to-use ? i have managed to stucture this from the guide but i dont think its ok.
Loading in data from BAM files.
require(csaw)
data <- windowCounts(bam.files, ext=110)
binned <- windowCounts(bam.files, bin=TRUE, width=10000)
Calculating normalization factors.
normfacs <- normalize(binned)
Filtering out uninteresting regions.
require(edgeR)
keep <- aveLogCPM(asDGEList(data)) >= -1
data <- data[keep,]
Identifying DB windows.
y <- asDGEList(data, norm.factors=normfacs)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design, robust=TRUE)
results <- glmQLFTest(fit)
Correcting for multiple testing.
merged <- mergeWindows(rowData(data), tol=1000L)
tabcom <- combineTests(merged$id, results$table)
bam.files <- c("es_1.bam", "es_2.bam", "tn_1.bam", "tn_2.bam")
design <- model.matrix(~factor(c('es', 'es', 'tn', 'tn')))
colnames(design) <- c("intercept", "cell.type")
Counting Reads into windows
for histone marks at least 150 fragment length
frag.len <- 150
window.width <- 1
data <- windowCounts(bam.files, ext=frag.len, width=window.width)
head(assay(data))
head(rowData(data))
data$totals
Filtering out low-quality reads
default.param <- readParam()
default.param
Avoiding problematic genomic regions
repeats <- GRanges("chr1", IRanges(3000001, 3002128))
new.param <- readParam(discard=repeats, restrict=c("chr1", "chr10", "chrX"))
demo <- windowCounts(bam.files, ext=frag.len, param=new.param)
head(rowData(demo))
PAIRED-END
petBamFile <- "example-pet.bam"
pet.param <- readParam(max.frag=400, pet="both")
demo <- windowCounts(petBamFile, param=pet.param)
demo$totals
out <- getPETSizes(petBamFile)
frag.sizes <- out$sizes[out$sizes<=800]
hist(frag.sizes, breaks=50, xlab="Fragment sizes (bp)", ylab="Frequency", main="")
abline(v=400, col="red")
c(out$diagnostics, too.large=sum(out$sizes > 400))
rescue.param <- reform(pet.param, rescue.ext=200, rescue.pairs=TRUE)
demo <- windowCounts(petBamFile, param=rescue.param)
demo$totals
#
Estimating the average fragment length
max.delay <- 500
dedup.on <- readParam(dedup=TRUE, minq=50)
x <- correlateReads(bam.files, max.delay, param=dedup.on)
plot(0:max.delay, x, type="l", ylab="CCF", xlab="Delay (bp)")
n <- 10000
dedup.on <- readParam(dedup=TRUE)
h3ac <- correlateReads("h3ac.bam", n, param=dedup.on)
h3k27me3 <- correlateReads("h3k27me3.bam", n, param=dedup.on)
h3k4me2 <- correlateReads("h3k4me2.bam", n, param=dedup.on)
plot(0:n, h3ac, col="blue", ylim=c(0, 0.1), xlim=c(0, 1000),
xlab="Delay (bp)", ylab="CCF", pch=16, type="l", lwd=2)
lines(0:n, h3k27me3, col="red", pch=16, lwd=2)
lines(0:n, h3k4me2, col="forestgreen", pch=16, lwd=2)
legend("topright", col=c("blue", "red", "forestgreen"), c("H3Ac", "H3K27me3", "H3K4me2"), pch=16)
Eliminating composition biases
binned <- windowCounts(bam.files, bin=TRUE, width=10000)
normfacs <- normalize(binned)
normfacs
ab <- aveLogCPM(asDGEList(binned))
keep <- ab <= quantile(ab, p=0.9)
normalize(binned[keep,])
Filtering by global enrichment
bin.size <- 2000L
binned <- windowCounts(bam.files, bin=TRUE, width=bin.size)
scaled <- bin.size/(window.width + frag.len)
scaled
pc <- 0.5
win.ab <- aveLogCPM(asDGEList(data), prior.count=pc)
bin.ab <- aveLogCPM(asDGEList(binned), prior.count=pc*scaled)
bin.ab <- bin.ab - log2(scaled)
threshold <- median(bin.ab) + log2(3)
keep <- abundances >= threshold
sum(keep)
hist(bin.ab, xlab="Adjusted bin log-CPM", breaks=100, main="")
abline(v=threshold, col="red")
Testing for differential Binding
Assign the filtered list back to data ??!! ??
original <- data
data <- demo
Setting up the data
y <- asDGEList(data, norm.factors=normfacs)
design
intercept cell.type
1 1 0
2 1 0
3 1 1
4 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$factor(c("es", "es", "tn", "tn"))
[1] "contr.treatment"
Estimating the dispersion
par(mfrow=c(1,2))
y <- estimateDisp(y, design)
o <- order(y$AveLogCPM)
plot(y$AveLogCPM[o], sqrt(y$trended.dispersion[o]), type="l", lwd=2, ylim=c(0, 1), xlab=expression("Ave."~Log[2]~"CPM"), ylab=("Biological coefficient of variation"))
abline(h=0.2, col="red")
fit <- glmQLFit(y, design, robust=TRUE)
plotQLDisp(fit)
yo <- asDGEList(original, norm.factors=normfacs)
yo <- estimateDisp(yo, design)
oo <- order(yo$AveLogCPM)
plot(yo$AveLogCPM[oo], sqrt(yo$trended.dispersion[oo]), type="l", lwd=2, ylim=c(0, max(sqrt(yo$trended))), xlab=expression("Ave."~Log[2]~"CPM"), ylab=("Biological coefficient of variation"))
lines(y$AveLogCPM[o], sqrt(y$trended[o]), lwd=2, col="grey")
legend("topright", c("raw", "filtered"), col=c("black", "grey"), lwd=2)
Testing for db windows
results <- glmQLFTest(fit, contrast=c(0, 1))
head(results$table)
Saving the results to file
ofile <- gzfile("clusters.gz", open="w")
write.table(data.frame(chr=as.character(seqnames(sig.bins)), start=start(sig.bins), end=end(sig.bins), tabcom, anno), file=ofile, row.names=FALSE, quote=FALSE, sep="\t")
close(ofile)
Simple visualization of genomic coverage
cur.region <- GRanges("chr18", IRanges(77806807, 77807165))
extractReads(cur.region, bam.files[1], param=readParam())
lib.sizes <- exp(getOffset(y))
mean.lib <- mean(lib.sizes)
max.depth <- 20 * lib.sizes/mean.lib
require(Gviz)
collected <- list()
for (i in 1:length(bam.files)) {reads <- extractReads(cur.region, bam.files[i])pcov <- as(coverage(reads[strand(reads)=="+"]), "GRanges")ncov <- as(coverage(reads[strand(reads)=="-"]), "GRanges")ptrack <- DataTrack(pcov, type="histogram", lwd=0,fill=rgb(0,0,1,0.4), ylim=c(0, max.depth[i]),name=bam.files[i], col.axis="black", col.title="black")ntrack <- DataTrack(ncov, type="histogram", lwd=0,fill=rgb(1,0,0,0.4), ylim=c(0, max.depth[i]))collected[[i]] <- OverlayTrack(trackList=list(ptrack,ntrack))}
gax <- GenomeAxisTrack(col="black")
plotTracks(c(gax, collected), from=start(cur.region), to=end(cur.region))
edit: sorry i cant seem to fix the format
i also had a friend who said: if you dont know what you are doing, maybe its worth the risk and doing it anyway.
Yeah, good luck with that.