I have the following problem: I want to find the next 5 genes (from chr1_genes) on both sides of each interval given in the GRanges object (gr3), and generate a data frame containing the chr, start and end of the interval (from gr3) plus one column for each ensembl_transcript_id, filled with NA where no gene is available.
Thanks a lot for suggestions!
# Two example query intervals on chr1.
df <- data.frame(
  chrom = c("chr1", "chr1"),
  start = c(5087459, 9995206),
  end = c(5097899, 10015020)
)
# Coerce the data frame to a GRanges object.
# NOTE(review): this needs the GRanges class to be available (presumably via
# GenomicRanges, which is not loaded in the code shown here) - confirm.
gr3 <- as(df, "GRanges")

library(biomaRt)

# Connect to the Ensembl human gene dataset and download the gene/transcript
# annotation for chromosome 1 (this queries the Ensembl web service).
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
chr1_genes <- getBM(
  attributes = c(
    "ensembl_gene_id", "ensembl_transcript_id", "hgnc_symbol",
    "chromosome_name", "start_position", "end_position"
  ),
  filters = "chromosome_name",
  values = "1",
  mart = ensembl
)
# Preview the annotation table (the object is `chr1_genes`, with an "s").
head(chr1_genes)
ensembl_gene_id ensembl_transcript_id hgnc_symbol chromosome_name start_position end_position
1 ENSG00000231510 ENST00000443270 1 5086459 5090899
2 ENSG00000162444 ENST00000315901 RBP7 1 9997206 10016020
3 ENSG00000162444 ENST00000294435 RBP7 1 9997206 10016020
4 ENSG00000270171 ENST00000602640 1 7693124 7694844
5 ENSG00000225643 ENST00000412797 1 25581478 25590356
6 ENSG00000116497 ENST00000530710 S100PBP 1 32816767 32858879
I have found this, but it is quite slow:
#' Collect the k nearest genes on each side of a query interval
#'
#' Starting from `gr`, repeatedly calls `precede()` and `follow()` against
#' `GR`, each time restarting from the range found in the previous step, and
#' records the `ensembl_gene_id` of every range visited.
#'
#' @param gr Query GRanges. The coordinates reported in the result are those
#'   of its first range (`gr[1]`), so it is meant to be called with one range
#'   at a time.
#' @param GR Subject GRanges carrying an `ensembl_gene_id` metadata column.
#' @param k Number of steps to take in each direction (default 5).
#' @return A named character vector: the columns of `as.data.frame(gr[1])`
#'   followed by the `k` ids found with `precede()` and then the `k` ids found
#'   with `follow()`. Steps for which no range exists are `NA`.
enh25gens <- function(gr, GR, k = 5L) {
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 0)

  # Extract the ids once; as.character() keeps a factor column from being
  # flattened to its integer codes later on.
  gene_ids <- as.character(mcols(GR)$ensembl_gene_id)

  # Take k successive steps with `step` (precede or follow) and return the
  # gene id reached at each step.
  walk_neighbours <- function(step) {
    ids <- vector("list", k)
    idx <- step(gr, subject = GR)
    for (i in seq_len(k)) {
      if (i > 1L) {
        # precede()/follow() give NA when there is no further range. GR must
        # not be indexed with NA, so only advance the hits and let the
        # exhausted ones stay NA for all remaining steps.
        found <- !is.na(idx)
        next_idx <- rep(NA_integer_, length(idx))
        if (any(found)) {
          next_idx[found] <- step(GR[idx[found]], subject = GR)
        }
        idx <- next_idx
      }
      # Indexing a character vector with NA yields NA, which is the
      # placeholder wanted for a missing neighbour.
      ids[[i]] <- gene_ids[idx]
    }
    unlist(ids)
  }

  pre_gene <- walk_neighbours(precede)
  fol_gene <- walk_neighbours(follow)

  # unlist() on a mix of factors and character vectors emits the factors'
  # integer codes (e.g. "1" instead of "chr1"), so convert the factor columns
  # (seqnames, strand) to character before flattening.
  coords <- as.data.frame(gr[1])
  is_fac <- vapply(coords, is.factor, logical(1))
  coords[is_fac] <- lapply(coords[is_fac], as.character)

  unlist(c(coords, pre_gene, fol_gene))
}
# Run the neighbour search once per interval of gr3 and bind the per-interval
# results row-wise into a single data frame.
df <- do.call(
  rbind.data.frame,
  lapply(gr3, function(interval) enh25gens(interval, chr1_genes.GR))
)
# Label the columns: interval coordinates first, then the five ids obtained
# with precede() and the five obtained with follow().
colnames(df) <- c(
  "chr", "start", "end", "width", "strand",
  paste0("pre", 1:5), paste0("fol", 1:5)
)
It would be nice to have a k-nearest neighbor finder for ranges. I will try to make something tomorrow morning.