Retrieve chromosome data using AssemblyInfo!

Using AssemblyInfo, we will retrieve the chromosome sizes from mm9!

[1]:
!pip3 install assemblyinfo
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: assemblyinfo in /home/alejandro/.local/lib/python3.11/site-packages (0.0.1)
Requirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from assemblyinfo) (1.24.3)
Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (1.5.3)
Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (16.1.0)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2023.3.post1)
Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->assemblyinfo) (1.16.0)
[2]:
import assemblyinfo
[3]:
# call assemblyinfo.connect() to get the database object (`db`) that the
# following cells query

db = assemblyinfo.connect()
[5]:
# you can easily see which assemblies are available by running:

db.available_assemblies()
[5]:
['WS144',
 'WBcel215',
 'WBcel235',
 'WS190',
 'WS195',
 'GRCh37',
 'GRCh38',
 'NCBI35',
 'NCBI36',
 'MGSCv37',
 'GRCm38',
 'GRCm39',
 'MGSCv34',
 'MGSCv35',
 'MGSCv36',
 'Release_5',
 'Release_6',
 'Release_6_plus_ISO1_MT',
 'Zv8',
 'Zv9',
 'GRCz10',
 'GRCz11',
 'Zv7',
 'ARS-UCD1.1',
 'ARS-UCD1.2',
 'ARS-UCD1.3',
 'ARS-UCD2.0',
 'bGalGal1.mat.broiler.GRCg7b',
 'UU_Cfam_GSD_1.0',
 'T2T-CHM13',
 'ASM3317019v1',
 'ASM3317019v2',
 'CanFam2.0',
 'CanFam3.1',
 'Dog10K_Boxer_Tasha',
 'ROS_Cfam_1.0',
 'Gallus_gallus-2.1',
 'Gallus_gallus-4.0',
 'Gallus_gallus-5.0',
 'GRCg6',
 'GRCg6a',
 'UMICH_Zoey_3.1',
 'ASM2820141v1',
 <NA>,
 'ce11',
 'ce6',
 'hg19',
 'hg38',
 'hg17',
 'hg18',
 'mm9',
 'mm10',
 'mm39',
 'mm6',
 'mm7',
 'mm8',
 'dm3',
 'dm6',
 'danRer6',
 'danRer7',
 'danRer10',
 'danRer11',
 'danRer5',
 'bosTau9',
 'galGal7',
 'canFam4',
 'hs1',
 'canFam2',
 'canFam3',
 'canFam6',
 'ROS_Cfam_1.0',
 'galGal3',
 'galGal4',
 'galGal5',
 'galGal6',
 'canFam5']
[6]:
# to get the chromosome sizes, first retrieve the assembly data with 'assembly_info()'
# here, we pass roles=["assembled"] to request only assembled chromosomes and bypass
# scaffolds or unplaced sequences

mm9 = db.assembly_info('mm9', roles=["assembled"])
[8]:
# let's check out the result!
# `chromsizes` is displayed below as chromosome lengths indexed by chromosome name

mm9.chromsizes
[8]:
name
chr1     197195432
chr2     181748087
chr3     159599783
chr4     155630120
chr5     152537259
chr6     149517037
chr7     152524553
chr8     131738871
chr9     124076172
chr10    129993255
chr11    121843856
chr12    121257530
chr13    120284312
chr14    125194864
chr15    103494974
chr16     98319150
chr17     95272651
chr18     90772031
chr19     61342430
chrX     166650296
chrY      15902555
chrM         16299
Name: length, dtype: Int64