Question

GenomicRanges method to make the total width of ranges a certain value?

0

Entering edit mode

mike-zx • 0

@06ba88f7

Last seen 17 months ago

Spain

I wish to extract the n positions closest to the end of a GRanges object or in other words, making the total width of the ranges in the object n if we count from the end (fixed to the end). Something similar can be done with the resize method although this one only manipulates the width of each range instead of the width of the whole object.

For example, I have the GRanges:

g <- GenomicRanges::GRanges(
  seqnames = c("chr12", "chr12"),
  ranges = IRanges::IRanges(c(9102084, 9099001), c(9102551, 9099001)),
  strand = c("-", "-")
)
print(g)
GRanges object with 2 ranges and 0 metadata columns:
      seqnames          ranges strand
         <Rle>       <IRanges>  <Rle>
  [1]    chr12 9102084-9102551      -
  [2]    chr12         9099001      -
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

And I wonder if there already exists a function f in GenomicRanges (or combination of functions) which allows me the following result:

n <- 10
g2 <- f(g, width = n, fix = "end")
print(g2)
GRanges object with 2 ranges and 0 metadata columns:
      seqnames          ranges strand
         <Rle>       <IRanges>  <Rle>
  [1]    chr12 9102084-9102092      -
  [2]    chr12         9099001      -
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths
sum(GenomicRanges::width(g2))
10

Note that I wish to work only with the positions defined in the original ranges and not with 9099001:9099010 for instance.

GenomicRanges • 1.1k views

ADD COMMENT • link updated 17 months ago by Malcolm Cook ★ 1.6k • written 17 months ago by mike-zx • 0

score 0 · Answer 1 · 2022-11-07

I'll call the function transcript_tail() because the way I think of your GRanges object is that is contains the exon ranges for a given transcript. Also I'll assume that the exons are ordered by ascending rank w.r.t. the transcript, like those returned by GenomicFeatures::exonsBy(..., by="tx"). Note that transcript_tail() could also be used on a GRanges object containing the CDS ranges for a given transcript, which would be another typical use case for this function.

But it's actually easier to start with transcript_head(), a function that keeps the first n positions in the transcript. So let's start with this one.

transcript_head()

library(GenomicRanges)

transcript_head <- function(exons, n=6)
{
    stopifnot(is(exons, "GRanges"))
    stopifnot(is.numeric(n), length(n) == 1, !is.na(n), n >= 0)
    nexons <- length(exons)
    if (n == 0 || nexons == 0)
        return(exons[integer(0)])
    cum_width <- cumsum(width(exons))  # cumulative width
    if (n >= cum_width[[nexons]])
        return(exons)
    ans_len <- sum(cum_width < n) + 1
    ans <- head(exons, n=ans_len)
    exon_to_clip_is_on_minus_strand <- as.character(strand(ans)[ans_len]) == "-"
    clip <- cum_width[ans_len] - n
    if (!exon_to_clip_is_on_minus_strand) {
        end(ans)[ans_len] <- end(ans)[ans_len] - clip
    } else {
        start(ans)[ans_len] <- start(ans)[ans_len] + clip
    }
    ans
}

Then:

exons <- GRanges(c("chr1:26-50:+", "chr1:71-75:+", "chr1:101-150:+"))
transcript_head(exons, n=20)
transcript_head(exons, n=26)
transcript_head(exons, n=32)

With your GRanges object g:

g <- GRanges(c("chr12:9102084-9102551:-", "chr12:9099001:-"))
transcript_head(g, n=10)
# GRanges object with 1 range and 0 metadata columns:
#       seqnames          ranges strand
#          <Rle>       <IRanges>  <Rle>
#   [1]    chr12 9102542-9102551      -
#   -------
#   seqinfo: 1 sequence from an unspecified genome; no seqlengths

transcript_tail()

transcript_tail <- function(exons, n=6)
{
    stopifnot(is(exons, "GRanges"))
    stopifnot(is.numeric(n), length(n) == 1, !is.na(n), n >= 0)
    nexons <- length(exons)
    if (n == 0 || nexons == 0)
        return(exons[integer(0)])
    cum_width <- cumsum(rev(width(exons)))  # cumulative width
    if (n >= cum_width[[nexons]])
        return(exons)
    ans_len <- sum(cum_width < n) + 1
    ans <- tail(exons, n=ans_len)
    exon_to_clip_is_on_minus_strand <- as.character(strand(ans)[1]) == "-"
    clip <- cum_width[ans_len] - n
    if (!exon_to_clip_is_on_minus_strand) {
        start(ans)[1] <- start(ans)[1] + clip
    } else {
        end(ans)[1] <- end(ans)[1] - clip
    }
    ans
}

Then:

transcript_tail(exons, n=40)
transcript_tail(exons, n=52)
transcript_tail(exons, n=56)

With your GRanges object g:

transcript_tail(g, n=10)
# GRanges object with 2 ranges and 0 metadata columns:
#       seqnames          ranges strand
#          <Rle>       <IRanges>  <Rle>
#   [1]    chr12 9102084-9102092      -
#   [2]    chr12         9099001      -
#   -------
#   seqinfo: 1 sequence from an unspecified genome; no seqlengths

Cheers,

H.

score 0 · Answer 2 · 2022-11-09

My restrictCumWidth might be clearer given that separates the logic of determining the restriction range from actually performing the restriction, which it turns over to the delightful GenomicRanges::restrict.

library(GenomicRanges)
restrictCumWidth<-function(x,cumwidth,fix='start',ignore.strand=TRUE) {
  stopifnot(fix %in% c('start','end'))
  stopifnot(isSorted(x))
  if(! ignore.strand) {
    s<-unique(strand(x))
    stopifnot(1!=length(s)) # what would it mean to have mixed strands???
    if (s=='-') fix<-ifelse(fix=='end','start','end')
  }
  endrev<-function(x) { if (fix=='end') rev(x) else x }
  cw<-cumsum(endrev(width(x)))
  i<-which(cw>=cumwidth)[1]
  if (is.na(i)) x 
  else if (fix=='start')
    restrict(x,min(start(x)),start(x)[i] + (cumwidth - c(0,cw)[i] - 1))
  else if (fix=='end')
    restrict(x,end(rev(x))[i] - (cumwidth - c(0,cw)[i] - 1),max(end(x)))
}

At a minimum it passes the following AgreesWithHerveTest: (thanks Hervé Pagès )

exons <- GRanges(c("chr1:26-50:+", "chr1:71-75:+", "chr1:101-150:+"))
stopifnot(restrictCumWidth(exons,20,fix='end') == transcript_tail(exons,20))
stopifnot(restrictCumWidth(exons,26,fix='end') == transcript_tail(exons,26))
stopifnot(restrictCumWidth(exons,32,fix='end') == transcript_tail(exons,32))