Change between chromosome naming nomenclatures easily!
Having trouble with chromosome naming nomenclatures? AssemblyInfo provides a simple interface to convert between them!
[1]:
!pip3 install assemblyinfo
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: assemblyinfo in /home/alejandro/.local/lib/python3.11/site-packages (0.0.1)
Requirement already satisfied: numpy<2,>=1.10 in /usr/lib64/python3.11/site-packages (from assemblyinfo) (1.24.3)
Requirement already satisfied: pandas>=1.3 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (1.5.3)
Requirement already satisfied: pyarrow>=5.0 in /home/alejandro/.local/lib/python3.11/site-packages (from assemblyinfo) (16.1.0)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3.11/site-packages (from pandas>=1.3->assemblyinfo) (2023.3.post1)
Requirement already satisfied: six>=1.5 in /usr/lib/python3.11/site-packages (from python-dateutil>=2.8.1->pandas>=1.3->assemblyinfo) (1.16.0)
[2]:
import assemblyinfo
[3]:
# Call assemblyinfo.connect() to obtain the database handle (`db`);
# the cells below run their queries through it.
db = assemblyinfo.connect()
[5]:
# See which assemblies are available for a species with get_species_info().
# The report is a multi-line string, so print() it: left as a bare last
# expression, the notebook would show its repr with literal "\n" escapes.
print(db.get_species_info("homo_sapiens"))
[5]:
'Genome Information for homo_sapiens:\n===================\nCommon Names:\n - human\n\nAssemblies (UCSC):\n - hg19, hg38, hg17, hg18, hs1\n\nAssemblies (NCBI):\n - GRCh37, GRCh38, NCBI35, NCBI36, T2T-CHM13\n\n'
[10]:
# AssemblyInfo also works as a plain lookup for sequence names:
# request only the "assembled" role of hg38 from the NCBI provider,
# then read the resulting names from `.chromnames`.
hg38 = db.assembly_info(
    "hg38",
    roles=["assembled"],
    provider="ncbi",
)
hg38.chromnames
[10]:
['1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9',
'10',
'11',
'12',
'13',
'14',
'15',
'16',
'17',
'18',
'19',
'20',
'21',
'22',
'X',
'Y',
'MT']
[11]:
# The same call can target other roles: here, the names of the
# "unplaced" sequences of mm10, again from the NCBI provider.
mm10 = db.assembly_info(
    "mm10",
    roles=["unplaced"],
    provider="ncbi",
)
mm10.chromnames
[11]:
['MSCHRUN_CTG1',
'MSCHRUN_CTG13',
'MSCHRUN_CTG14',
'MSCHRUN_CTG21',
'MSCHRUN_CTG2',
'MSCHRUN_CTG22',
'MSCHRUN_CTG19',
'MSCHRUN_CTG16',
'MSCHRUN_CTG3',
'MSCHRUN_CTG20',
'MSCHRUN_CTG4',
'MSCHRUN_CTG5',
'MSCHRUN_CTG6',
'MSCHRUN_CTG7',
'MSCHRUN_CTG17',
'MSCHRUN_CTG18',
'MSCHRUN_CTG9',
'MSCHRUN_CTG10',
'MSCHRUN_CTG11',
'MSCHRUN_CTG12',
'MSCHRUN_CTG15',
'MSCHRUN_CTG23']
[12]:
# Several roles can be requested at once: unplaced and unlocalized
# sequences of mm10, this time from the UCSC provider.
# Note: this rebinds `mm10`, replacing the result of the previous query.
mm10 = db.assembly_info(
    "mm10",
    roles=["unplaced", "unlocalized"],
    provider="ucsc",
)
mm10.chromnames
[12]:
['chr1_GL456210_random',
'chr1_GL456211_random',
'chr1_GL456212_random',
'chr1_GL456213_random',
'chr1_GL456221_random',
'chr4_GL456216_random',
'chr4_GL456350_random',
'chr4_JH584292_random',
'chr4_JH584293_random',
'chr4_JH584294_random',
'chr4_JH584295_random',
'chr5_GL456354_random',
'chr5_JH584296_random',
'chr5_JH584297_random',
'chr5_JH584298_random',
'chr5_JH584299_random',
'chr7_GL456219_random',
'chrX_GL456233_random',
'chrY_JH584300_random',
'chrY_JH584301_random',
'chrY_JH584302_random',
'chrY_JH584303_random',
'chrUn_GL456239',
'chrUn_GL456359',
'chrUn_GL456360',
'chrUn_GL456366',
'chrUn_GL456367',
'chrUn_GL456368',
'chrUn_GL456370',
'chrUn_GL456372',
'chrUn_GL456378',
'chrUn_GL456379',
'chrUn_GL456381',
'chrUn_GL456382',
'chrUn_GL456383',
'chrUn_GL456385',
'chrUn_GL456387',
'chrUn_GL456389',
'chrUn_GL456390',
'chrUn_GL456392',
'chrUn_GL456393',
'chrUn_GL456394',
'chrUn_GL456396',
'chrUn_JH584304']