Why does BSgenome contain more chromosomes for the human and mouse species than they actually have?
3
0
Entering edit mode
@182c3042
Last seen 2.7 years ago
United States

If I load the mouse mm10 genome, the names function gives me back 239 chromosomes:

> library("BSgenome.Mmusculus.UCSC.mm10")
> mouse <- BSgenome.Mmusculus.UCSC.mm10
> names(mouse)
  [1] "chr1"                 "chr2"                 "chr3"                
  [4] "chr4"                 "chr5"                 "chr6"                
  [7] "chr7"                 "chr8"                 "chr9"                
 [10] "chr10"                "chr11"                "chr12"               
 [13] "chr13"                "chr14"                "chr15"               
 [16] "chr16"                "chr17"                "chr18"               
 [19] "chr19"                "chrX"                 "chrY"                
 [22] "chrM"                 "chr1_GL455991_alt"    "chr1_GL455992_alt"   
 [25] "chr1_GL455993_alt"    "chr1_GL456005_alt"    "chr1_JH584315_alt"   
 [28] "chr1_JH584320_alt"    "chr1_JH584321_alt"    "chr1_JH584322_alt"   
 [31] "chr2_GL456024_alt"    "chr3_GL456006_alt"    "chr3_GL456007_alt"   
 [34] "chr3_GL456008_alt"    "chr3_GL456042_alt"    "chr3_GL456044_alt"   
 [37] "chr3_GL456045_alt"    "chr3_GL456048_alt"    "chr3_GL456049_alt"   
 [40] "chr3_JH584323_alt"    "chr4_GL455994_alt"    "chr4_GL456009_alt"   
 [43] "chr4_GL456010_alt"    "chr4_GL456053_alt"    "chr4_GL456064_alt"   
 [46] "chr4_GL456075_alt"    "chr4_GL456076_alt"    "chr4_GL456077_alt"   
 [49] "chr4_JH584268_alt"    "chr4_JH584269_alt"    "chr4_JH584324_alt"   
 [52] "chr4_JH584325_alt"    "chr4_JH584326_alt"    "chr5_GL455995_alt"   
 [55] "chr5_GL456011_alt"    "chr6_GL456012_alt"    "chr6_GL456025_alt"   
 [58] "chr6_GL456026_alt"    "chr6_GL456054_alt"    "chr6_GL456065_alt"   
 [61] "chr6_JH584264_alt"    "chr7_GL455989_alt"    "chr7_GL456013_alt"   
 [64] "chr7_GL456014_alt"    "chr8_GL455996_alt"    "chr8_GL455997_alt"   
 [67] "chr10_GL456015_alt"   "chr11_GL455998_alt"   "chr11_GL456016_alt"  
 [70] "chr11_GL456060_alt"   "chr11_JH584265_alt"   "chr11_JH584316_alt"  
 [73] "chr11_JH584317_alt"   "chr11_JH584327_alt"   "chr12_GL456017_alt"  
 [76] "chr12_GL456068_alt"   "chr12_GL456074_alt"   "chr12_GL456078_alt"  
 [79] "chr12_GL456349_alt"   "chr13_GL455990_alt"   "chr13_GL455999_alt"  
 [82] "chr13_JH584305_alt"   "chr14_GL456019_alt"   "chr14_GL456020_alt"  
 [85] "chr15_GL456000_alt"   "chr15_JH584270_alt"   "chr16_GL456001_alt"  
 [88] "chr16_GL456028_alt"   "chr16_JH584306_alt"   "chr16_JH584307_alt"  
 [91] "chr16_JH584310_alt"   "chr16_JH584311_alt"   "chr16_JH584312_alt"  
 [94] "chr16_JH584313_alt"   "chr16_JH584314_alt"   "chr17_GL456002_alt"  
 [97] "chr17_GL456021_alt"   "chr17_GL456022_alt"   "chr17_GL456069_alt"  
[100] "chr17_JH584266_alt"   "chr17_JH584267_alt"   "chr17_JH584308_alt"  
[103] "chr17_JH584309_alt"   "chr17_JH584328_alt"   "chr17_JH590470_alt"  
[106] "chr18_GL456070_alt"   "chr18_JH584318_alt"   "chr19_GL456079_alt"  
[109] "chr19_JH584319_alt"   "chrX_GL456003_alt"    "chrX_GL456004_alt"   
[112] "chrX_GL456031_alt"    "chrX_GL456032_alt"    "chrX_GL456033_alt"   
[115] "chrY_GL456071_alt"    "chrY_GL456072_alt"    "chrY_GL456073_alt"   
[118] "chrY_GL456080_alt"    "chrY_GL456081_alt"    "chrY_GL456082_alt"   
[121] "chrna_GL456050_alt"   "chr1_GL456210_random" "chr1_GL456211_random"
[124] "chr1_GL456212_random" "chr1_GL456213_random" "chr1_GL456221_random"
[127] "chr4_GL456216_random" "chr4_GL456350_random" "chr4_JH584292_random"
[130] "chr4_JH584293_random" "chr4_JH584294_random" "chr4_JH584295_random"
[133] "chr5_GL456354_random" "chr5_JH584296_random" "chr5_JH584297_random"
[136] "chr5_JH584298_random" "chr5_JH584299_random" "chr7_GL456219_random"
[139] "chrX_GL456233_random" "chrY_JH584300_random" "chrY_JH584301_random"
[142] "chrY_JH584302_random" "chrY_JH584303_random" "chrUn_GL456239"      
[145] "chrUn_GL456359"       "chrUn_GL456360"       "chrUn_GL456366"      
[148] "chrUn_GL456367"       "chrUn_GL456368"       "chrUn_GL456370"      
[151] "chrUn_GL456372"       "chrUn_GL456378"       "chrUn_GL456379"      
[154] "chrUn_GL456381"       "chrUn_GL456382"       "chrUn_GL456383"      
[157] "chrUn_GL456385"       "chrUn_GL456387"       "chrUn_GL456389"      
[160] "chrUn_GL456390"       "chrUn_GL456392"       "chrUn_GL456393"      
[163] "chrUn_GL456394"       "chrUn_GL456396"       "chrUn_JH584304"      
[166] "chr1_KV575232_fix"    "chr1_KV575233_fix"    "chr1_KV575234_fix"   
[169] "chr2_KV575235_fix"    "chr2_KV575236_fix"    "chr3_KQ030484_fix"   
[172] "chr3_KZ289064_fix"    "chr3_KZ289065_fix"    "chr3_KZ289066_fix"   
[175] "chr4_JH792826_fix"    "chr4_KQ030485_fix"    "chr4_KQ030486_fix"   
[178] "chr4_KQ030487_fix"    "chr4_KQ030488_fix"    "chr4_KQ030489_fix"   
[181] "chr4_KZ289067_fix"    "chr4_KZ289068_fix"    "chr4_KZ289069_fix"   
[184] "chr4_KZ289070_fix"    "chr5_JH792827_fix"    "chr5_KV575237_fix"   
[187] "chr6_KK082442_fix"    "chr6_KK082443_fix"    "chr7_JH792828_fix"   
[190] "chr7_KV575238_fix"    "chr7_KV575239_fix"    "chr8_KV575240_fix"   
[193] "chr9_KB469738_fix"    "chr9_KQ030490_fix"    "chr10_KQ030491_fix"  
[196] "chr10_KZ289071_fix"   "chr10_KZ289072_fix"   "chr11_KB469739_fix"  
[199] "chr11_KZ289076_fix"   "chr12_KB469740_fix"   "chr12_KZ289082_fix"  
[202] "chr14_KZ289083_fix"   "chr14_KZ289084_fix"   "chr15_KQ030492_fix"  
[205] "chr15_KQ030493_fix"   "chr15_KV575241_fix"   "chr15_KZ289085_fix"  
[208] "chr15_KZ289086_fix"   "chr16_KB469741_fix"   "chr17_KB469742_fix"  
[211] "chr17_KZ289087_fix"   "chr17_KZ289088_fix"   "chr17_KZ289089_fix"  
[214] "chr18_JH792829_fix"   "chr18_KZ289090_fix"   "chr18_KZ289091_fix"  
[217] "chr19_JH792830_fix"   "chr19_KQ030494_fix"   "chr19_KV575242_fix"  
[220] "chrX_JH792831_fix"    "chrX_KQ030495_fix"    "chrX_KQ030496_fix"   
[223] "chrX_KQ030497_fix"    "chrX_KZ289092_fix"    "chrX_KZ289093_fix"   
[226] "chrX_KZ289094_fix"    "chrX_KZ289095_fix"    "chrY_JH792832_fix"   
[229] "chrY_JH792833_fix"    "chrY_JH792834_fix"    "chr1_KK082441_alt"   
[232] "chr11_KZ289073_alt"   "chr11_KZ289074_alt"   "chr11_KZ289075_alt"  
[235] "chr11_KZ289077_alt"   "chr11_KZ289078_alt"   "chr11_KZ289079_alt"  
[238] "chr11_KZ289080_alt"   "chr11_KZ289081_alt"

How is that possible? What if I don't need all of these, just the normal chr1-19 plus chrX, chrY, and chrM? Can I get rid of the unnecessary ones?

BSgenome.Mmusculus.UCSC.mm10 • 1.3k views
ADD COMMENT
3
Entering edit mode
@james-w-macdonald-5106
Last seen 19 hours ago
United States

Those are all the chromosomes, plus alternate haplotypes and patches for that species. If we only sequenced one animal for each species, it would be simple and nice because we would just have the standard chromosomes. But as we sequence more and more individuals the whole thing just keeps getting more and more complicated, and accounting for the complication is itself complicated.

You cannot 'get rid of' parts of a BSgenome package. Depending on what you are doing, you can exclude parts of the package, and if what you are doing involves using a GRanges to extract information, then by default you can exclude the alternative sequences (unless you have alternate sequences in your GRanges).

You could build your own BSgenome package that just contains the standard chromosomes, but without knowing what the problem is, it's hard to say if the extra effort is worth it.

ADD COMMENT
2
Entering edit mode
Robert Castelo ★ 3.4k
@rcastelo
Last seen 5 weeks ago
Barcelona/Universitat Pompeu Fabra

On top of the clarifications given by James and Hervé, you may want to know that the package GenomeInfoDb, which is automatically loaded when you load a BSgenome package, provides a function called standardChromosomes() to fetch what you call the "normal" chromosomes:

library("BSgenome.Mmusculus.UCSC.mm10")
mouse <- BSgenome.Mmusculus.UCSC.mm10
mmstdchr <- standardChromosomes(mouse)
mmstdchr
 [1] "chr1"  "chr2"  "chr3"  "chr4"  "chr5"  "chr6"  "chr7"  "chr8"  "chr9" 
[10] "chr10" "chr11" "chr12" "chr13" "chr14" "chr15" "chr16" "chr17" "chr18"
[19] "chr19" "chrX"  "chrY"  "chrM"

so that you can operate on those chromosomes without having to "hardcode" their names, e.g.:

seqlengths(mouse)[mmstdchr]
     chr1      chr2      chr3      chr4      chr5      chr6      chr7      chr8 
195471971 182113224 160039680 156508116 151834684 149736546 145441459 129401213 
     chr9     chr10     chr11     chr12     chr13     chr14     chr15     chr16 
124595110 130694993 122082543 120129022 120421639 124902244 104043685  98207768 
    chr17     chr18     chr19      chrX      chrY      chrM 
 94987271  90702639  61431566 171031299  91744698     16299 

getSeq(mouse, mmstdchr[22])
16299-letter DNAString object
seq: GTTAATGTAGCTTAATAACAAAGCAAAGCACTGAAA...TCTAATCATACTCTATTACGCAATAAACATTAACAA
ADD COMMENT
0
Entering edit mode
@herve-pages-1542
Last seen 6 hours ago
Seattle, WA, United States

Also just to be clear: the mm10 genome is from the UCSC genome browser and is itself based on the GRCm38.p6 assembly from the Genome Reference Consortium: https://www.ncbi.nlm.nih.gov/assembly/GCF_000001635.26/ We don't control what sequences are in this assembly. We just wrap the whole thing in a BSgenome package.

Best,

H.

ADD COMMENT

Login before adding your answer.

Traffic: 985 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6