Canonical transcripts Mus musculus
1
@ramonmassoni-19642
Last seen 8 months ago
Spain
Hi!
I have a set of target peaks that I would like to link to promoter regions from the mm10 genome annotation. I retrieve the promoter regions and subset them to standard chromosomes as follows:
# Load the range infrastructure and the mm10 knownGene transcript annotation.
library(GenomicRanges)
library(GenomicFeatures)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)

# One promoter window per transcript, using the default window size.
promoters <- promoters(TxDb.Mmusculus.UCSC.mm10.knownGene)

# Keep only the promoters located on chr1-chr19, chrX and chrY.
promoters <- promoters[
  seqnames(promoters) %in% paste0("chr", c(seq_len(19), "X", "Y"))
]
This gives me the following GenomicRanges object
GRanges object with 142314 ranges and 2 metadata columns:
                       seqnames            ranges strand |     tx_id              tx_name
                          <Rle>         <IRanges>  <Rle> | <integer>          <character>
  ENSMUST00000193812.1     chr1   3071253-3073452      + |         1 ENSMUST00000193812.1
  ENSMUST00000082908.1     chr1   3100016-3102215      + |         2 ENSMUST00000082908.1
  ENSMUST00000192857.1     chr1   3250757-3252956      + |         3 ENSMUST00000192857.1
  ENSMUST00000161581.1     chr1   3464587-3466786      + |         4 ENSMUST00000161581.1
  ENSMUST00000192183.1     chr1   3529795-3531994      + |         5 ENSMUST00000192183.1
                   ...      ...               ...    ... .       ...                  ...
  ENSMUST00000187582.6     chrY 90667426-90669625      - |    142310 ENSMUST00000187582.6
  ENSMUST00000191048.1     chrY 90667426-90669625      - |    142311 ENSMUST00000191048.1
  ENSMUST00000238676.1     chrY 90755268-90757467      - |    142312 ENSMUST00000238676.1
  ENSMUST00000177893.1     chrY 90754622-90756821      - |    142313 ENSMUST00000177893.1
  ENSMUST00000179623.1     chrY 90838978-90841177      - |    142314 ENSMUST00000179623.1
  -------
  seqinfo: 66 sequences (1 circular) from mm10 genome
I would like to simplify it to obtain just a canonical transcript per gene, and include the gene symbol as an extra column in the metadata.
Could you please help me figure out how to do so? Thank you!!
TxDb.Mmusculus.UCSC.mm9.knownGene
ensembldb
GenomicFeatures
Mus_musculus
• 671 views
@james-w-macdonald-5106
Last seen 7 hours ago
United States
Ensembl makes claims as to canonical transcripts, so we can just believe them, I imagine.
> # NOTE(review): this walkthrough uses mm39, while the question uses mm10;
> # use the TxDb package that matches the genome build of your peaks.
> library(Mus.musculus)
> library(TxDb.Mmusculus.UCSC.mm39.knownGene)
> # Swap the transcript database used by the Mus.musculus object for mm39.
> TxDb(Mus.musculus) <- TxDb.Mmusculus.UCSC.mm39.knownGene
> # Promoter ranges for every transcript, using the default window size.
> z <- promoters(TxDb.Mmusculus.UCSC.mm39.knownGene)
Warning message:
In valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE) :
GRanges object contains 2
out-of-bound ranges located on
sequences chr4_JH584295v1_random
and chr5_JH584296v1_random. Note
that ranges located on a
sequence whose length is unknown
(NA) or on a circular sequence
are not considered out-of-bound
(use seqlengths() and
isCircular() to get the lengths
and circularity flags of the
underlying sequences). You can
use trim() to trim these ranges.
See ?`trim,GenomicRanges-method`
for more information.
> # Restrict to the standard chromosomes, dropping ranges on other sequences.
> z <- keepStandardChromosomes(z, pruning.mode = "coarse")
> # Look up a gene symbol for each transcript name; store it as metadata.
> z$symbol <- mapIds(
+   Mus.musculus,
+   keys = names(z),
+   column = "SYMBOL",
+   keytype = "TXNAME"
+ )
'select()' returned 1:1 mapping
between keys and columns
> # Fraction of promoters for which no gene symbol was found.
> mean(is.na(z$symbol))
[ 1] 0.2246602
That gives us the symbols. Now let's get canonical.
> library(biomaRt)
> # Connect to the Ensembl mouse gene mart.
> mart <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
> # For each versioned transcript ID in z, fetch Ensembl's canonical flag.
> zz <- getBM(
+   attributes = c("ensembl_transcript_id_version", "transcript_is_canonical"),
+   filters = "ensembl_transcript_id_version",
+   values = names(z),
+   mart = mart
+ )
> # Keep only the transcripts that Ensembl flags as canonical.
> zzcon <- subset(zz, transcript_is_canonical == 1L)
> # Subset the promoter ranges by the canonical transcript names.
> zcon <- z[zzcon$ensembl_transcript_id_version]
> zcon
GRanges object with 57078 ranges and 3 metadata columns:
seqnames
< Rle>
ENSMUST00000194081.2 chr1
ENSMUST00000194393.2 chr1
ENSMUST00000194605.2 chr1
ENSMUST00000191703.2 chr1
ENSMUST00000191467.2 chr1
.. . .. .
ENSMUST00000181996.2 chrX
ENSMUST00000127786.4 chrX
ENSMUST00020181898.1 chrX
ENSMUST00020182793.1 chrX
ENSMUST00000144563.2 chrX
ranges
< IRanges>
ENSMUST00000194081.2 108342807-108345006
ENSMUST00000194393.2 6978784-6980983
ENSMUST00000194605.2 6984783-6986982
ENSMUST00000191703.2 6997983-7000182
ENSMUST00000191467.2 108695865-108698064
.. . .. .
ENSMUST00000181996.2 102426576-102428775
ENSMUST00000127786.4 102526661-102528860
ENSMUST00020181898.1 102504692-102506891
ENSMUST00020182793.1 102513203-102515402
ENSMUST00000144563.2 102569315-102571514
strand |
< Rle> |
ENSMUST00000194081.2 + |
ENSMUST00000194393.2 + |
ENSMUST00000194605.2 + |
ENSMUST00000191703.2 + |
ENSMUST00000191467.2 + |
.. . .. . .
ENSMUST00000181996.2 - |
ENSMUST00000127786.4 - |
ENSMUST00020181898.1 - |
ENSMUST00020182793.1 - |
ENSMUST00000144563.2 - |
tx_id
< integer>
ENSMUST00000194081.2 2417
ENSMUST00000194393.2 75
ENSMUST00000194605.2 76
ENSMUST00000191703.2 77
ENSMUST00000191467.2 2418
.. . .. .
ENSMUST00000181996.2 146163
ENSMUST00000127786.4 146164
ENSMUST00020181898.1 146166
ENSMUST00020182793.1 146167
ENSMUST00000144563.2 146168
tx_name
< character>
ENSMUST00000194081.2 ENSMUST00000194081.2
ENSMUST00000194393.2 ENSMUST00000194393.2
ENSMUST00000194605.2 ENSMUST00000194605.2
ENSMUST00000191703.2 ENSMUST00000191703.2
ENSMUST00000191467.2 ENSMUST00000191467.2
.. . .. .
ENSMUST00000181996.2 ENSMUST00000181996.2
ENSMUST00000127786.4 ENSMUST00000127786.4
ENSMUST00020181898.1 ENSMUST00020181898.1
ENSMUST00020182793.1 ENSMUST00020182793.1
ENSMUST00000144563.2 ENSMUST00000144563.2
symbol
< character>
ENSMUST00000194081.2 < NA>
ENSMUST00000194393.2 < NA>
ENSMUST00000194605.2 < NA>
ENSMUST00000191703.2 < NA>
ENSMUST00000191467.2 < NA>
.. . .. .
ENSMUST00000181996.2 < NA>
ENSMUST00000127786.4 Xist
ENSMUST00020181898.1 < NA>
ENSMUST00020182793.1 < NA>
ENSMUST00000144563.2 < NA>
-------
seqinfo: 22 sequences ( 1 circular) from mm39 genome
Login before adding your answer.
Traffic: 884 users visited in the last hour