Switch between chromosome naming conventions easily!

Having trouble with chromosome naming conventions? AssemblyInfo provides a simple interface to convert between them!

[1]:
!pip3 install assemblyinfo
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: assemblyinfo in /home/alejandro/.local/lib/python3.11/site-packages (0.0.1)
Requirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from assemblyinfo) (1.24.3)
Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (1.5.3)
Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (16.1.0)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2023.3.post1)
Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->assemblyinfo) (1.16.0)
[2]:
import assemblyinfo
[3]:
# Open a connection with the module-level connect() function; the returned
# object (`db`) is what the lookups in the following cells are called on.

db = assemblyinfo.connect()
[5]:
# List the assemblies available for a species with get_species_info().
# The return value is a plain string, so a bare expression displays its repr
# with literal "\n"; wrap the call in print() to render the line breaks.

db.get_species_info("homo_sapiens")
[5]:
'Genome Information for homo_sapiens:\n===================\nCommon Names:\n  - human\n\nAssemblies (UCSC):\n  - hg19, hg38, hg17, hg18, hs1\n\nAssemblies (NCBI):\n  - GRCh37, GRCh38, NCBI35, NCBI36, T2T-CHM13\n\n'
[10]:
# If you only need AssemblyInfo as a wrapper to get the names of the assembled
# chromosomes: restrict `roles` to "assembled", choose the naming provider
# (here "ncbi"), then read the `chromnames` attribute.

hg38 = db.assembly_info("hg38", roles=["assembled"], provider="ncbi")
hg38.chromnames
[10]:
['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 'X',
 'Y',
 'MT']
[11]:
# What if (for some strange reason) we need the names of the unplaced
# sequences in mm10? Ask for the "unplaced" role instead.

mm10 = db.assembly_info("mm10", roles=["unplaced"], provider="ncbi")
mm10.chromnames
[11]:
['MSCHRUN_CTG1',
 'MSCHRUN_CTG13',
 'MSCHRUN_CTG14',
 'MSCHRUN_CTG21',
 'MSCHRUN_CTG2',
 'MSCHRUN_CTG22',
 'MSCHRUN_CTG19',
 'MSCHRUN_CTG16',
 'MSCHRUN_CTG3',
 'MSCHRUN_CTG20',
 'MSCHRUN_CTG4',
 'MSCHRUN_CTG5',
 'MSCHRUN_CTG6',
 'MSCHRUN_CTG7',
 'MSCHRUN_CTG17',
 'MSCHRUN_CTG18',
 'MSCHRUN_CTG9',
 'MSCHRUN_CTG10',
 'MSCHRUN_CTG11',
 'MSCHRUN_CTG12',
 'MSCHRUN_CTG15',
 'MSCHRUN_CTG23']
[12]:
# Now the unplaced and unlocalized sequences together, but with UCSC-style
# names: pass both roles and provider="ucsc".
# Note: this rebinds `mm10`, replacing the NCBI-named object assigned earlier.

mm10 = db.assembly_info("mm10", roles=["unplaced", "unlocalized"], provider="ucsc")
mm10.chromnames
[12]:
['chr1_GL456210_random',
 'chr1_GL456211_random',
 'chr1_GL456212_random',
 'chr1_GL456213_random',
 'chr1_GL456221_random',
 'chr4_GL456216_random',
 'chr4_GL456350_random',
 'chr4_JH584292_random',
 'chr4_JH584293_random',
 'chr4_JH584294_random',
 'chr4_JH584295_random',
 'chr5_GL456354_random',
 'chr5_JH584296_random',
 'chr5_JH584297_random',
 'chr5_JH584298_random',
 'chr5_JH584299_random',
 'chr7_GL456219_random',
 'chrX_GL456233_random',
 'chrY_JH584300_random',
 'chrY_JH584301_random',
 'chrY_JH584302_random',
 'chrY_JH584303_random',
 'chrUn_GL456239',
 'chrUn_GL456359',
 'chrUn_GL456360',
 'chrUn_GL456366',
 'chrUn_GL456367',
 'chrUn_GL456368',
 'chrUn_GL456370',
 'chrUn_GL456372',
 'chrUn_GL456378',
 'chrUn_GL456379',
 'chrUn_GL456381',
 'chrUn_GL456382',
 'chrUn_GL456383',
 'chrUn_GL456385',
 'chrUn_GL456387',
 'chrUn_GL456389',
 'chrUn_GL456390',
 'chrUn_GL456392',
 'chrUn_GL456393',
 'chrUn_GL456394',
 'chrUn_GL456396',
 'chrUn_JH584304']