CSAW ready-to-use R script
polaxgr:

Hello all, I have read this 2-3 times: https://bioconductor.riken.jp/packages/3.0/bioc/vignettes/csaw/inst/doc/csawUserGuide.pdf , the csaw user guide. But my knowledge of R is very limited. I have 6 samples (0h, 2 replicates; 3h, 2 replicates; 6h, 2 replicates). I have done alignment with bwa-mem, converted SAM to BAM, sorted, removed duplicates, kept only uniquely mapped reads, merged the 2 replicates into 1 BAM file (as the guide says), and indexed every BAM file. Is there a ready-to-use script I can find somewhere? I have managed to put the following together from the guide, but I don't think it's right.

 

Loading in data from BAM files.

require(csaw)
data <- windowCounts(bam.files, ext=110)
binned <- windowCounts(bam.files, bin=TRUE, width=10000)

Calculating normalization factors.

normfacs <- normalize(binned)

Filtering out uninteresting regions.

require(edgeR)
keep <- aveLogCPM(asDGEList(data)) >= -1
data <- data[keep,]

Identifying DB windows.

y <- asDGEList(data, norm.factors=normfacs)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design, robust=TRUE)
results <- glmQLFTest(fit)

Correcting for multiple testing.

merged <- mergeWindows(rowData(data), tol=1000L)
tabcom <- combineTests(merged$id, results$table)
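A natural next step is to threshold the combined tests on their FDR; a minimal sketch, assuming tabcom carries the FDR column that combineTests() reports:

is.sig <- tabcom$FDR <= 0.05  # assumes an FDR column in the combineTests output
summary(is.sig)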

# These two objects are used above; this is the order they appear in the guide's quick start.
bam.files <- c("es_1.bam", "es_2.bam", "tn_1.bam", "tn_2.bam")
design <- model.matrix(~factor(c('es', 'es', 'tn', 'tn')))
colnames(design) <- c("intercept", "cell.type")

Counting reads into windows. For histone marks, a fragment length of at least 150 bp:

frag.len <- 150
window.width <- 1
data <- windowCounts(bam.files, ext=frag.len, width=window.width)
head(assay(data))
head(rowData(data))
data$totals

Filtering out low-quality reads.

default.param <- readParam()
default.param

Avoiding problematic genomic regions.

repeats <- GRanges("chr1", IRanges(3000001, 3002128))
new.param <- readParam(discard=repeats, restrict=c("chr1", "chr10", "chrX"))
demo <- windowCounts(bam.files, ext=frag.len, param=new.param)
head(rowData(demo))

Paired-end data.

petBamFile <- "example-pet.bam"
pet.param <- readParam(max.frag=400, pet="both")
demo <- windowCounts(petBamFile, param=pet.param)
demo$totals

out <- getPETSizes(petBamFile)
frag.sizes <- out$sizes[out$sizes<=800]
hist(frag.sizes, breaks=50, xlab="Fragment sizes (bp)", ylab="Frequency", main="")
abline(v=400, col="red")
c(out$diagnostics, too.large=sum(out$sizes > 400))

rescue.param <- reform(pet.param, rescue.ext=200, rescue.pairs=TRUE)
demo <- windowCounts(petBamFile, param=rescue.param)
demo$totals

Estimating the average fragment length.

max.delay <- 500
dedup.on <- readParam(dedup=TRUE, minq=50)
x <- correlateReads(bam.files, max.delay, param=dedup.on)
plot(0:max.delay, x, type="l", ylab="CCF", xlab="Delay (bp)")

n <- 10000
dedup.on <- readParam(dedup=TRUE)
h3ac <- correlateReads("h3ac.bam", n, param=dedup.on)
h3k27me3 <- correlateReads("h3k27me3.bam", n, param=dedup.on)
h3k4me2 <- correlateReads("h3k4me2.bam", n, param=dedup.on)
plot(0:n, h3ac, col="blue", ylim=c(0, 0.1), xlim=c(0, 1000),
    xlab="Delay (bp)", ylab="CCF", pch=16, type="l", lwd=2)
lines(0:n, h3k27me3, col="red", pch=16, lwd=2)
lines(0:n, h3k4me2, col="forestgreen", pch=16, lwd=2)
legend("topright", col=c("blue", "red", "forestgreen"), c("H3Ac", "H3K27me3", "H3K4me2"), pch=16)

Eliminating composition biases.

binned <- windowCounts(bam.files, bin=TRUE, width=10000)
normfacs <- normalize(binned)
normfacs

ab <- aveLogCPM(asDGEList(binned))
keep <- ab <= quantile(ab, p=0.9)
normalize(binned[keep,])

Filtering by global enrichment.

bin.size <- 2000L
binned <- windowCounts(bam.files, bin=TRUE, width=bin.size)
scaled <- bin.size/(window.width + frag.len)
scaled

pc <- 0.5
win.ab <- aveLogCPM(asDGEList(data), prior.count=pc)
bin.ab <- aveLogCPM(asDGEList(binned), prior.count=pc*scaled)
bin.ab <- bin.ab - log2(scaled)
threshold <- median(bin.ab) + log2(3)
keep <- win.ab >= threshold  # was `abundances`, which is undefined here; the window abundances are win.ab
sum(keep)

hist(bin.ab, xlab="Adjusted bin log-CPM", breaks=100, main="")
abline(v=threshold, col="red")

 

Testing for differential binding.

Assign the filtered list back to data?? (I'm not sure about this step.)

original <- data
data <- demo
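If the idea was to carry on with the windows that passed the global-enrichment filter (rather than with demo, which holds the paired-end example counts), the quick start's pattern suggests a sketch like:

original <- data      # keep the unfiltered object for the dispersion comparison below
data <- data[keep,]   # keep: the global-enrichment filter computed above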

Setting up the data.

y <- asDGEList(data, norm.factors=normfacs)
design

which prints:

  intercept cell.type
1         1         0
2         1         0
3         1         1
4         1         1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$`factor(c("es", "es", "tn", "tn"))`
[1] "contr.treatment"

Estimating the dispersion.

par(mfrow=c(1,2))
y <- estimateDisp(y, design)
o <- order(y$AveLogCPM)
plot(y$AveLogCPM[o], sqrt(y$trended.dispersion[o]), type="l", lwd=2, ylim=c(0, 1),
    xlab=expression("Ave."~Log[2]~"CPM"), ylab="Biological coefficient of variation")
abline(h=0.2, col="red")
fit <- glmQLFit(y, design, robust=TRUE)
plotQLDisp(fit)

yo <- asDGEList(original, norm.factors=normfacs)
yo <- estimateDisp(yo, design)
oo <- order(yo$AveLogCPM)
plot(yo$AveLogCPM[oo], sqrt(yo$trended.dispersion[oo]), type="l", lwd=2,
    ylim=c(0, max(sqrt(yo$trended.dispersion))), xlab=expression("Ave."~Log[2]~"CPM"),
    ylab="Biological coefficient of variation")
lines(y$AveLogCPM[o], sqrt(y$trended.dispersion[o]), lwd=2, col="grey")
legend("topright", c("raw", "filtered"), col=c("black", "grey"), lwd=2)

Testing for DB windows.

results <- glmQLFTest(fit, contrast=c(0, 1))
head(results$table)
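To eyeball the strongest windows, the table can be sorted by p-value (assuming the PValue column that glmQLFTest's table normally carries):

by.p <- order(results$table$PValue)  # smallest p-values first
head(results$table[by.p,])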

Saving the results to file.

# sig.bins and anno come from earlier guide steps that are not pasted here.
ofile <- gzfile("clusters.gz", open="w")
write.table(data.frame(chr=as.character(seqnames(sig.bins)), start=start(sig.bins),
    end=end(sig.bins), tabcom, anno), file=ofile, row.names=FALSE, quote=FALSE, sep="\t")
close(ofile)

Simple visualization of genomic coverage.

cur.region <- GRanges("chr18", IRanges(77806807, 77807165))
extractReads(cur.region, bam.files[1], param=readParam())
lib.sizes <- exp(getOffset(y))
mean.lib <- mean(lib.sizes)
max.depth <- 20 * lib.sizes/mean.lib

require(Gviz)
collected <- list()
for (i in 1:length(bam.files)) {
    reads <- extractReads(cur.region, bam.files[i])
    pcov <- as(coverage(reads[strand(reads)=="+"]), "GRanges")
    ncov <- as(coverage(reads[strand(reads)=="-"]), "GRanges")
    ptrack <- DataTrack(pcov, type="histogram", lwd=0, fill=rgb(0,0,1,0.4),
        ylim=c(0, max.depth[i]), name=bam.files[i], col.axis="black", col.title="black")
    ntrack <- DataTrack(ncov, type="histogram", lwd=0, fill=rgb(1,0,0,0.4),
        ylim=c(0, max.depth[i]))
    collected[[i]] <- OverlayTrack(trackList=list(ptrack, ntrack))
}
gax <- GenomeAxisTrack(col="black")
plotTracks(c(gax, collected), from=start(cur.region), to=end(cur.region))

 

Edit: sorry, I can't seem to fix the formatting.

Aaron Lun replies:
We had a family friend who used to be an electrician, and he always used to tell me this:

If you don't know what you're doing, you probably shouldn't be doing it.

This seems particularly pertinent in your case. I recognize parts of the code that you've written in your post (after all, I wrote it), but a lot of it is just not appropriate for your data set. For example, you don't even have the "h3ac.bam" file; the result of normalize goes nowhere; the design matrix is not correct for your experiment; the user's guide that you've read is at least 2 years out of date; and so on. This points to a fundamental lack of understanding that is not easily remedied by one or two posts on this support site.
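For what it's worth, a design matrix that matches the experiment you describe (0h, 3h and 6h, two replicates each) would look more like the sketch below, not the two-group 'es'/'tn' example copied from the guide. This assumes the six replicate BAM files are kept as separate samples rather than merged, and the column names are just illustrative:

timepoint <- factor(c("0h", "0h", "3h", "3h", "6h", "6h"))  # one entry per BAM file, in order
design <- model.matrix(~timepoint)                          # 0h becomes the reference level
colnames(design) <- c("intercept", "t3h.vs.0h", "t6h.vs.0h")  # illustrative names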

In my opinion, you have two options. The first is to spend at least a month to get up to speed with R and statistics for genomics - I'm sure there are many resources, tutorials and courses online that you should be able to find. Alternatively, you can collaborate with a local bioinformatician or computational biologist to do your analysis for you. You get your results back quickly (and properly), they get a middle authorship upon write-up, and everyone's happy.

polaxgr replies:
I also had a friend who said: if you don't know what you are doing, maybe it's worth the risk to do it anyway.

 

Aaron Lun:
Yeah, good luck with that.
