#

Question

CSAW ready to use R script

0

Entering edit mode

polaxgr ▴ 10

@polaxgr-14480

Last seen 4.6 years ago

Greece

Hello all, I have read 2-3 times this : https://bioconductor.riken.jp/packages/3.0/bioc/vignettes/csaw/inst/doc/csawUserGuide.pdf , csaw user guide. But, my knowledge in R is very limited. I have 6 samples, (0h 2 replicates, 3h 2 replicates, 6h 2 replicates) , i have done alignment with bwa-mem, converted sam to bam, sorted, removed duplicates, kept only uniquely mapped, merged the 2 replicates into 1 bam file ( as the guide says ) and indexed every bam file. Is there any script which i can find somewhere ready-to-use ? i have managed to stucture this from the guide but i dont think its ok.

Loading in data from BAM files.

require(csaw)

data <- windowCounts(bam.files, ext=110)

binned <- windowCounts(bam.files, bin=TRUE, width=10000)

Calculating normalization factors.

normfacs <- normalize(binned)

Filtering out uninteresting regions.

require(edgeR)

keep <- aveLogCPM(asDGEList(data)) >= -1

data <- data[keep,]

Identifying DB windows.

y <- asDGEList(data, norm.factors=normfacs)

y <- estimateDisp(y, design)

fit <- glmQLFit(y, design, robust=TRUE)

results <- glmQLFTest(fit)

Correcting for multiple testing.

merged <- mergeWindows(rowData(data), tol=1000L)

tabcom <- combineTests(merged$id, results$table)

bam.files <- c("es_1.bam", "es_2.bam", "tn_1.bam", "tn_2.bam")

design <- model.matrix(~factor(c('es', 'es', 'tn', 'tn')))

colnames(design) <- c("intercept", "cell.type")

Counting Reads into windows

for histone marks at least 150 fragment length

frag.len <- 150

window.width <- 1

data <- windowCounts(bam.files, ext=frag.len, width=window.width)

head(assay(data))

head(rowData(data))

data$totals

Filtering out low-quality reads

default.param <- readParam()

default.param

Avoiding problematic genomic regions

repeats <- GRanges("chr1", IRanges(3000001, 3002128))

new.param <- readParam(discard=repeats, restrict=c("chr1", "chr10", "chrX"))

demo <- windowCounts(bam.files, ext=frag.len, param=new.param)

head(rowData(demo))

PAIRED-END

petBamFile <- "example-pet.bam"

pet.param <- readParam(max.frag=400, pet="both")

demo <- windowCounts(petBamFile, param=pet.param)

demo$totals

out <- getPETSizes(petBamFile)

frag.sizes <- out$sizes[out$sizes<=800]

hist(frag.sizes, breaks=50, xlab="Fragment sizes (bp)", ylab="Frequency", main="")

abline(v=400, col="red")

c(out$diagnostics, too.large=sum(out$sizes > 400))

rescue.param <- reform(pet.param, rescue.ext=200, rescue.pairs=TRUE)

demo <- windowCounts(petBamFile, param=rescue.param)

demo$totals

#

Estimating the average fragment length

max.delay <- 500

dedup.on <- readParam(dedup=TRUE, minq=50)

x <- correlateReads(bam.files, max.delay, param=dedup.on)

plot(0:max.delay, x, type="l", ylab="CCF", xlab="Delay (bp)")

n <- 10000

dedup.on <- readParam(dedup=TRUE)

h3ac <- correlateReads("h3ac.bam", n, param=dedup.on)

h3k27me3 <- correlateReads("h3k27me3.bam", n, param=dedup.on)

h3k4me2 <- correlateReads("h3k4me2.bam", n, param=dedup.on)

plot(0:n, h3ac, col="blue", ylim=c(0, 0.1), xlim=c(0, 1000),

xlab="Delay (bp)", ylab="CCF", pch=16, type="l", lwd=2)

lines(0:n, h3k27me3, col="red", pch=16, lwd=2)

lines(0:n, h3k4me2, col="forestgreen", pch=16, lwd=2)

legend("topright", col=c("blue", "red", "forestgreen"), c("H3Ac", "H3K27me3", "H3K4me2"), pch=16)

Eliminating composition biases

binned <- windowCounts(bam.files, bin=TRUE, width=10000)

normfacs <- normalize(binned)

normfacs

ab <- aveLogCPM(asDGEList(binned))

keep <- ab <= quantile(ab, p=0.9)

normalize(binned[keep,])

Filtering by global enrichment

bin.size <- 2000L

binned <- windowCounts(bam.files, bin=TRUE, width=bin.size)

scaled <- bin.size/(window.width + frag.len)

scaled

pc <- 0.5

win.ab <- aveLogCPM(asDGEList(data), prior.count=pc)

bin.ab <- aveLogCPM(asDGEList(binned), prior.count=pc*scaled)

bin.ab <- bin.ab - log2(scaled)

threshold <- median(bin.ab) + log2(3)

keep <- abundances >= threshold

sum(keep)

hist(bin.ab, xlab="Adjusted bin log-CPM", breaks=100, main="")

abline(v=threshold, col="red")

Testing for differential Binding

Assign the filtered list back to data ??!! ??

original <- data

data <- demo

Setting up the data

y <- asDGEList(data, norm.factors=normfacs)

design

intercept cell.type

1 1 0

2 1 0

3 1 1

4 1 1

attr(,"assign")

[1] 0 1

attr(,"contrasts")

attr(,"contrasts")$factor(c("es", "es", "tn", "tn"))

[1] "contr.treatment"

Estimating the dispersion

par(mfrow=c(1,2))

y <- estimateDisp(y, design)

o <- order(y$AveLogCPM)

plot(y$AveLogCPM[o], sqrt(y$trended.dispersion[o]), type="l", lwd=2, ylim=c(0, 1), xlab=expression("Ave."~Log[2]~"CPM"), ylab=("Biological coefficient of variation"))

abline(h=0.2, col="red")

fit <- glmQLFit(y, design, robust=TRUE)

plotQLDisp(fit)

yo <- asDGEList(original, norm.factors=normfacs)

yo <- estimateDisp(yo, design)

oo <- order(yo$AveLogCPM)

plot(yo$AveLogCPM[oo], sqrt(yo$trended.dispersion[oo]), type="l", lwd=2, ylim=c(0, max(sqrt(yo$trended))), xlab=expression("Ave."~Log[2]~"CPM"), ylab=("Biological coefficient of variation"))

lines(y$AveLogCPM[o], sqrt(y$trended[o]), lwd=2, col="grey")

legend("topright", c("raw", "filtered"), col=c("black", "grey"), lwd=2)

Testing for db windows

results <- glmQLFTest(fit, contrast=c(0, 1))

head(results$table)

Saving the results to file

ofile <- gzfile("clusters.gz", open="w")

write.table(data.frame(chr=as.character(seqnames(sig.bins)), start=start(sig.bins), end=end(sig.bins), tabcom, anno), file=ofile, row.names=FALSE, quote=FALSE, sep="\t")

close(ofile)

Simple visualization of genomic coverage

cur.region <- GRanges("chr18", IRanges(77806807, 77807165))

extractReads(cur.region, bam.files[1], param=readParam())

lib.sizes <- exp(getOffset(y))

mean.lib <- mean(lib.sizes)

max.depth <- 20 * lib.sizes/mean.lib

require(Gviz)

collected <- list()

for (i in 1:length(bam.files)) {reads <- extractReads(cur.region, bam.files[i])pcov <- as(coverage(reads[strand(reads)=="+"]), "GRanges")ncov <- as(coverage(reads[strand(reads)=="-"]), "GRanges")ptrack <- DataTrack(pcov, type="histogram", lwd=0,fill=rgb(0,0,1,0.4), ylim=c(0, max.depth[i]),name=bam.files[i], col.axis="black", col.title="black")ntrack <- DataTrack(ncov, type="histogram", lwd=0,fill=rgb(1,0,0,0.4), ylim=c(0, max.depth[i]))collected[[i]] <- OverlayTrack(trackList=list(ptrack,ntrack))}

gax <- GenomeAxisTrack(col="black")

plotTracks(c(gax, collected), from=start(cur.region), to=end(cur.region))

edit: sorry i cant seem to fix the format

csaw • 1.8k views

ADD COMMENT • link updated 7.2 years ago by Aaron Lun ★ 29k • written 7.2 years ago by polaxgr ▴ 10

score 1 · Answer 1 · 2018-09-07

We had a family friend who used to be an electrician, and he always used to tell me this:

If you don't know what you're doing, you probably shouldn't be doing it.

This seems particularly pertinent in your case. I recognize parts of the code that you've written in your post (after all, I wrote it), but a lot of it is just not appropriate for your data set. For example, you don't even have the "h3ac.bam" file; the result of normalize goes nowhere; the design matrix is not correct for your experiment; the user's guide that you've read is at least 2 years out of date; and so on. This points to a fundamental lack of understanding that is not easily remedied by one or two posts on this support site.

In my opinion, you have two options. The first is to spend at least a month to get up to speed with R and statistics for genomics - I'm sure there are many resources, tutorials and courses online that you should be able to find. Alternatively, you can collaborate with a local bioinformatician or computational biologist to do your analysis for you. You get your results back quickly (and properly), they get a middle authorship upon write-up, and everyone's happy.