Retrieve chromosome data using AssemblyInfo!
Using AssemblyInfo, we will retrieve the chromosome sizes from mm9!
[1]:
!pip3 install assemblyinfo
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: assemblyinfo in /home/alejandro/.local/lib/python3.11/site-packages (0.0.1)
Requirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from assemblyinfo) (1.24.3)
Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (1.5.3)
Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (16.1.0)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2023.3.post1)
Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->assemblyinfo) (1.16.0)
[2]:
import assemblyinfo
[3]:
# Call connect() to obtain the database object; `db` is the handle that the
# following cells query for assembly information.
db = assemblyinfo.connect()
[5]:
# You can easily see which assemblies are available by running the call below.
# The returned list includes entries such as 'GRCh38', 'hg38' and 'mm9'.
db.available_assemblies()
[5]:
['WS144',
'WBcel215',
'WBcel235',
'WS190',
'WS195',
'GRCh37',
'GRCh38',
'NCBI35',
'NCBI36',
'MGSCv37',
'GRCm38',
'GRCm39',
'MGSCv34',
'MGSCv35',
'MGSCv36',
'Release_5',
'Release_6',
'Release_6_plus_ISO1_MT',
'Zv8',
'Zv9',
'GRCz10',
'GRCz11',
'Zv7',
'ARS-UCD1.1',
'ARS-UCD1.2',
'ARS-UCD1.3',
'ARS-UCD2.0',
'bGalGal1.mat.broiler.GRCg7b',
'UU_Cfam_GSD_1.0',
'T2T-CHM13',
'ASM3317019v1',
'ASM3317019v2',
'CanFam2.0',
'CanFam3.1',
'Dog10K_Boxer_Tasha',
'ROS_Cfam_1.0',
'Gallus_gallus-2.1',
'Gallus_gallus-4.0',
'Gallus_gallus-5.0',
'GRCg6',
'GRCg6a',
'UMICH_Zoey_3.1',
'ASM2820141v1',
<NA>,
'ce11',
'ce6',
'hg19',
'hg38',
'hg17',
'hg18',
'mm9',
'mm10',
'mm39',
'mm6',
'mm7',
'mm8',
'dm3',
'dm6',
'danRer6',
'danRer7',
'danRer10',
'danRer11',
'danRer5',
'bosTau9',
'galGal7',
'canFam4',
'hs1',
'canFam2',
'canFam3',
'canFam6',
'ROS_Cfam_1.0',
'galGal3',
'galGal4',
'galGal5',
'galGal6',
'canFam5']
[6]:
# To get the chromosome sizes, extract the data for one assembly with
# 'assembly_info()'. Here we pass roles=["assembled"] because we want only
# assembled chromosomes, bypassing scaffolds and unplaced sequences.
mm9 = db.assembly_info('mm9', roles=["assembled"])
[8]:
# Let's check out the result! `chromsizes` displays the length of each
# chromosome, indexed by chromosome name.
mm9.chromsizes
[8]:
name
chr1 197195432
chr2 181748087
chr3 159599783
chr4 155630120
chr5 152537259
chr6 149517037
chr7 152524553
chr8 131738871
chr9 124076172
chr10 129993255
chr11 121843856
chr12 121257530
chr13 120284312
chr14 125194864
chr15 103494974
chr16 98319150
chr17 95272651
chr18 90772031
chr19 61342430
chrX 166650296
chrY 15902555
chrM 16299
Name: length, dtype: Int64