Hello BioC community,
We're trying to map SNPs to the nearest genes. We find we're getting different mappings depending on the version of GenomicRanges used, even when the input files are the same. I've pasted our code below. I looked at 3 of the discrepant SNPs using the UCSC genome browser (to visually confirm which gene was infact closer). It seemed to me that one version mapped the SNP to the nearest gene and the other seemed to somehow be strand aware (ie it mapped it to another gene that was in the vicinity but happened to be oriented opposite to the true nearest gene).
EDIT:
---
An example. Coordinates are in hg19 (although I don't think that's relevant).
GenomicRanges 1.30.0: rs2298217 C1orf159 8242
GenomicRanges 1.30.3: rs2298217 LINC01342 2417
I just tested with v 1.22.4 and the SNP is mapped to LINC01342 there as well, and is the desired mapping.
---
Is there something in my usage of the functions that was working before but perhaps, with newer versions of GRanges, elicits a different behaviour?
Thank you,
Shraddha
----
Shraddha Pai
Post-doctoral Fellow, http://baderlab.org
# get the gene inclusion region
# gFile - refGene file with header
# marg: (integer) region upstream and downstream of [txStart,txEnd] that
# is considered within the domain of a gene
.getGeneIncRanges <- function(gFile, marg=5000) {
refGene <- read.delim(gFile,sep="\t",header=TRUE,as.is=TRUE)
refGene_GR <- GRanges(refGene[,"chrom"],
IRanges(refGene[,"txStart"]+1 - marg,refGene[,"txEnd"]+marg),
strand=refGene[,"strand"],name=refGene[,"name2"])
return(refGene_GR)
}
# ----------------------------------------------
# Work begins
# ----------------------------------------------
cat("* Reading SNP table\n")
t0 <- system.time(snps <- read.table(snpFile,header=TRUE,as.is=TRUE))
print(t0)
# note: this is unstranded.
snp_GR <- GRanges(paste("chr", snps$CHR,sep=""),
IRanges(snps$BP,snps$BP),
name=snps$SNP)
gene_GR <- .getGeneIncRanges(geneFile,marg=5000); # [TSS,TES]+/- 5kb domain
start_GR <- resize(gene_GR,fix="start",width=1L)
end_GR <- resize(gene_GR,fix="end",width=1L)
cat("* Computing distances\n")
d0 <- distanceToNearest(snp_GR, gene_GR)
dbit <- d0@elementMetadata@listData$distance
d_in <- data.frame(as.matrix(d0),dbit)
# snps inside the gene domain
idx <- which(dbit ==0)
out1 <- cbind(snp_GR$name[d_in$queryHits[idx]],
gene_GR$name[d_in$subjectHits[idx]],
d_in$dbit[idx])
# snps outside gene domain - compute distance to start and end of domain
# and keep whichever one is less
idx <- which(dbit > 0) # not in a gene
snp2_GR <- snp_GR[idx]
cat(sprintf("%i SNPs not inside gene domain\n", length(idx)))
cat("* Computing distance to domain starts\n")
d1 <- distanceToNearest(snp2_GR,start_GR)
dbit<- d1@elementMetadata@listData$distance
d_start <- cbind(as.matrix(d1),dbit)
cat("* Computing distance to domain ends\n")
d2 <- distanceToNearest(snp2_GR,end_GR)
dbit<- d2@elementMetadata@listData$distance
d_end <- cbind(as.matrix(d2),dbit)
d_both <- merge(d_start,d_end, by.x="queryHits",by.y="queryHits")
# start is closer than end
idx <- which(d_both$dbit.x < d_both$dbit.y);
out2 <- cbind(snp2_GR$name[d_both$queryHits[idx]],
start_GR$name[d_both$subjectHits.x[idx]],
d_both$dbit.x[idx]
)
idx <- setdiff(1:nrow(d_both),idx)
out3 <- cbind(snp2_GR$name[d_both$queryHits[idx]],
end_GR$name[d_both$subjectHits.y[idx]],
d_both$dbit.y[idx]
)
cat("* Writing to output file\n")
options(scipen=10) #don't convert numbers to sci notation
write.table(out1,file=outFile,sep="\t",col=F,row=F,quote=F)
write.table(out2,file=outFile,sep="\t",col=F,row=F,quote=F,append=T)
write.table(out3,file=outFile,sep="\t",col=F,row=F,quote=F,append=T)<font face="sans-serif, Arial, Verdana, Trebuchet MS">
</font>