Open thommetz opened 3 weeks ago
Hello, Could you please include a full traceback? A corrupted gzip file suggests an accidental download issue. I recommend wiping the cache of omnipath
or pypath
(depending on where the traceback leads us):
# omnipath:
rm ~/.cache/omnipathdb/*
# pypath:
rm ~/.cache/pypath/*
...and try loading the dataset again. It's also good to make sure you perform the downloads with a good network connection. Edit: I see you use Windows, in this case change ~/.cache
according to your system, you can find it by from pypath.share import cache; cache.get_cachedir()
.
Hi, thanks for the prompt answer! Unfortunately even after wiping the cache, the problem still occurs. If using 'human' as organism, it works fine. Here you can find the full traceback.
Downloading data from `https://omnipathdb.org/queries/enzsub?format=json`
Downloading data from `https://omnipathdb.org/queries/interactions?format=json`
Downloading data from `https://omnipathdb.org/queries/complexes?format=json`
Downloading data from `https://omnipathdb.org/queries/annotations?format=json`
Downloading data from `https://omnipathdb.org/queries/intercell?format=json`
Downloading data from `https://omnipathdb.org/about?format=text`
Downloading annotations for all proteins from the following resources: `['PROGENy']`
Downloading data from `https://omnipathdb.org/annotations?format=tsv&resources=PROGENy`
39.0M/? [00:04<00:00, 14.4MB/s]
---------------------------------------------------------------------------
BadGzipFile Traceback (most recent call last)
Cell In[5], line 1
----> 1 progeny = dc.get_progeny(organism="mus musculus", top=500)
File ~\miniconda3\envs\thomas\Lib\site-packages\decoupler\omnip.py:250, in get_progeny(organism, top, genesymbol_resource, **kwargs)
248 p = p.drop('record_id', axis=1)
249 p.columns.name = None
--> 250 p = _annotation_identifiers(p, organism, genesymbol_resource)
251 p = p[['pathway', 'genesymbol', 'weight', 'p_value']]
252 p = p[~p.duplicated(['pathway', 'genesymbol'])]
File ~\miniconda3\envs\thomas\Lib\site-packages\decoupler\omnip.py:919, in _annotation_identifiers(net, organism, genesymbol_resource)
906 def _annotation_identifiers(
907 net: pd.DataFrame,
908 organism: str | int,
(...)
914 ) = None,
915 ) -> pd.DataFrame:
917 if not _is_human(organism):
--> 919 net = translate_net(
920 net,
921 columns='uniprot',
922 id_type='uniprot',
923 source_organism=9606,
924 target_organism=organism,
925 )
927 if genesymbol_resource is False:
929 net['genesymbol'] = net['uniprot']
File ~\miniconda3\envs\thomas\Lib\site-packages\decoupler\omnip.py:758, in translate_net(net, columns, source_organism, target_organism, id_type, unique_by, **kwargs)
755 hom_net = net.copy()
757 # Translate
--> 758 hom_net = orthology.translate_df(
759 df=hom_net,
760 target=_target_organism,
761 cols=columns,
762 source=_source_organism,
763 )
765 unique_by = common.to_list(unique_by)
767 if unique_by and all(c in hom_net.columns for c in unique_by):
768
769 # Remove duplicated based on source and target
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:2262, in translate_df(df, target, source, cols, id_type, only_swissprot, oma, homologene, ensembl, oma_rel_type, oma_score, ensembl_hc, ensembl_types, **kwargs)
2259 args.pop('manager')
2260 args.pop('kwargs')
-> 2262 return manager.translate_df(**args, **kwargs)
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:620, in OrthologyManager.translate_df(self, df, target, source, cols, id_type, only_swissprot, oma, homologene, ensembl, oma_rel_type, oma_score, ensembl_hc, ensembl_types, **kwargs)
618 args.pop('self')
619 args['id_type'] = _id_type
--> 620 ortho_df = self.get_df(**args)
622 table = self.which_table(
623 target = target,
624 source = source,
(...)
627 resource = 'oma',
628 )
630 df = table.translate_df(
631 df = df,
632 cols = [c for c, i in cols.items() if i == _id_type],
633 ortho_df = ortho_df,
634 )
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:515, in OrthologyManager.get_df(self, target, source, id_type, only_swissprot, oma, homologene, ensembl, oma_rel_type, oma_score, ensembl_hc, ensembl_types, full_records, **kwargs)
511 if not param[resource]:
513 continue
--> 515 table = self.which_table(
516 target = target,
517 source = source,
518 only_swissprot = only_swissprot,
519 id_type = id_type,
520 resource = resource,
521 )
523 result.append(
524 table.df(
525 full_records = full_records,
(...)
531 )
532 )
534 return pd.concat(result)
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:233, in OrthologyManager.which_table(self, target, source, only_swissprot, resource, id_type)
229 self.expiry[key] = time.time()
231 if key not in self.tables:
--> 233 self.load(key)
235 if key in self.tables:
237 return self.tables[key]
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:242, in OrthologyManager.load(self, key)
240 def load(self, key):
--> 242 self.tables[key] = globals()[f'{key.resource.capitalize()}Orthology'](
243 target = key.target,
244 source = key.source,
245 only_swissprot = key.only_swissprot,
246 id_type = key.id_type,
247 )
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:1492, in OmaOrthology.__init__(self, target, source, id_type, only_swissprot, rel_type, score)
1462 def __init__(
1463 self,
1464 target: int | str,
(...)
1472 score: float | None = None,
1473 ):
1474 """
1475 Orthology translation with Ensembl data.
1476
(...)
1489 Lower threshold for similarity metric.
1490 """
-> 1492 ProteinOrthology.__init__(**locals())
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:869, in ProteinOrthology.__init__(self, target, source, id_type, only_swissprot, **kwargs)
867 self.load_proteome(self.source)
868 self._set_param(kwargs, *self._param)
--> 869 self.load()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:1506, in OmaOrthology.load(self)
1502 if self._from_pickle():
1504 return
-> 1506 oma_data = oma_input.oma_orthologs(
1507 organism_a = self.source,
1508 organism_b = self.target,
1509 id_type = self.id_type,
1510 )
1511 self.data = collections.defaultdict(set)
1513 for rec in oma_data:
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\oma.py:125, in oma_orthologs(organism_a, organism_b, id_type, rel_type, score, return_df)
119 if (
120 (score and rec['score'] < score) or
121 (rel_type and rec['rel_type'] not in rel_type)
122 ):
123 continue
--> 125 a, b = (
126 [
127 OmaGene(
128 id = id_,
129 oma_group = e['oma_group'],
130 hog = e['oma_hog_id'],
131 taxon = e['species']['taxon_id'],
132 chr = e['chromosome'],
133 start = int(e['locus']['start']),
134 end = int(e['locus']['end']),
135 strand = int(e['locus']['strand']),
136 main_isoform = e['is_main_isoform'],
137
138 )
139 for id_ in _id_translate(
140 id_ = e['canonicalid'],
141 taxon = e['species']['taxon_id'],
142 id_type = id_type,
143 )
144 ]
145 for e in (rec[f'entry_{ei}'] for ei in (1, 2))
146 )
149 result.update(
150 {
151 OmaOrthology(
(...)
159 }
160 )
162 if page > n_pages: break
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\oma.py:139, in <genexpr>(.0)
119 if (
120 (score and rec['score'] < score) or
121 (rel_type and rec['rel_type'] not in rel_type)
122 ):
123 continue
125 a, b = (
126 [
127 OmaGene(
128 id = id_,
129 oma_group = e['oma_group'],
130 hog = e['oma_hog_id'],
131 taxon = e['species']['taxon_id'],
132 chr = e['chromosome'],
133 start = int(e['locus']['start']),
134 end = int(e['locus']['end']),
135 strand = int(e['locus']['strand']),
136 main_isoform = e['is_main_isoform'],
137
138 )
--> 139 for id_ in _id_translate(
140 id_ = e['canonicalid'],
141 taxon = e['species']['taxon_id'],
142 id_type = id_type,
143 )
144 ]
145 for e in (rec[f'entry_{ei}'] for ei in (1, 2))
146 )
149 result.update(
150 {
151 OmaOrthology(
(...)
159 }
160 )
162 if page > n_pages: break
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\oma.py:244, in _id_translate(id_, taxon, id_type)
234 if not id_type: return {id_}
236 s_id_type = (
237 'ensg'
238 if id_.startswith('ENS') else
(...)
241 'uniprot'
242 )
--> 244 uniprots = mapping.map_name(
245 id_,
246 s_id_type,
247 'uniprot',
248 ncbi_tax_id = taxon,
249 )
251 return mapping.map_names(
252 uniprots,
253 'uniprot',
254 id_type,
255 ncbi_tax_id = taxon,
256 ) if uniprots else set()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:3551, in map_name(name, id_type, target_id_type, ncbi_tax_id, strict, expand_complexes, uniprot_cleanup)
3498 """
3499 Translates one instance of one ID type to a different one.
3500 Returns set of the target ID type.
(...)
3546 ID, call the `uniprot_cleanup` function at the end.
3547 """
3549 mapper = get_mapper()
-> 3551 return mapper.map_name(
3552 name = name,
3553 id_type = id_type,
3554 target_id_type = target_id_type,
3555 ncbi_tax_id = ncbi_tax_id,
3556 strict = strict,
3557 expand_complexes = expand_complexes,
3558 uniprot_cleanup = uniprot_cleanup,
3559 )
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath_common\_misc.py:2953, in ignore_unhashable.<locals>.wrapper(*args, **kwargs)
2950 @functools.wraps(func, assigned = attributes)
2951 def wrapper(*args, **kwargs):
2952 try:
-> 2953 return func(*args, **kwargs)
2954 except TypeError as error:
2955 if 'unhashable type' in str(error):
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:2193, in Mapper.map_name(self, name, id_type, target_id_type, ncbi_tax_id, strict, expand_complexes, uniprot_cleanup)
2189 # for UniProt IDs we do a few more steps to
2190 # try to find out the primary SwissProt ID
2191 if uniprot_cleanup and target_id_type == 'uniprot':
-> 2193 mapped_names = self.uniprot_cleanup(
2194 uniprots = mapped_names,
2195 ncbi_tax_id = ncbi_tax_id,
2196 )
2198 return mapped_names
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:2227, in Mapper.uniprot_cleanup(self, uniprots, ncbi_tax_id)
2224 # step 2: translate TrEMBL to SwissProt by gene symbols
2225 if self._trembl_swissprot_by_genesymbol:
-> 2227 uniprots = self.trembl_swissprot(
2228 uniprots,
2229 ncbi_tax_id = ncbi_tax_id,
2230 )
2232 # step 3: translate deleted IDs by gene symbols
2233 if self._translate_deleted_uniprot:
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:2868, in Mapper.trembl_swissprot(self, uniprots, ncbi_tax_id)
2865 for uniprot in uniprots:
2867 swissprot = None
-> 2868 genesymbols = self.map_name(
2869 name = uniprot,
2870 id_type = 'trembl',
2871 target_id_type = 'genesymbol',
2872 ncbi_tax_id = ncbi_tax_id,
2873 )
2875 this_swissprots = self.map_names(
2876 names = genesymbols,
2877 id_type = 'genesymbol',
2878 target_id_type = 'swissprot',
2879 ncbi_tax_id = ncbi_tax_id,
2880 )
2882 if not this_swissprots:
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath_common\_misc.py:2953, in ignore_unhashable.<locals>.wrapper(*args, **kwargs)
2950 @functools.wraps(func, assigned = attributes)
2951 def wrapper(*args, **kwargs):
2952 try:
-> 2953 return func(*args, **kwargs)
2954 except TypeError as error:
2955 if 'unhashable type' in str(error):
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:1978, in Mapper.map_name(self, name, id_type, target_id_type, ncbi_tax_id, strict, expand_complexes, uniprot_cleanup)
1964 mapped_names = self.chain_map(
1965 name = name,
1966 id_type = id_type,
(...)
1972 uniprot_cleanup = uniprot_cleanup,
1973 )
1975 else:
1976
1977 # all the other ID types
-> 1978 mapped_names = self._map_name(
1979 name = name,
1980 id_type = id_type,
1981 target_id_type = target_id_type,
1982 ncbi_tax_id = ncbi_tax_id,
1983 )
1985 # as ID translation tables for PRO IDs are not organism specific
1986 # we need an extra step to limit the results to the target organism
1987 if id_type == 'pro' and target_id_type == 'uniprot':
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:2510, in Mapper._map_name(self, name, id_type, target_id_type, ncbi_tax_id)
2503 """
2504 Once we have defined the name type and the target name type,
2505 this function looks it up in the most suitable dictionary.
2506 """
2508 ncbi_tax_id = ncbi_tax_id or self.ncbi_tax_id
-> 2510 tbl = self.which_table(
2511 id_type,
2512 target_id_type,
2513 ncbi_tax_id = ncbi_tax_id,
2514 )
2516 return tbl[name] if tbl else set()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:1562, in Mapper.which_table(self, id_type, target_id_type, load, ncbi_tax_id)
1551 if resource:
1553 self._log(
1554 'Chosen built-in defined ID translation table: '
1555 'resource=%s, id_type_a=%s, id_type_b=%s' % (
(...)
1559 )
1560 )
-> 1562 self.load_mapping(
1563 resource = resource,
1564 load_a_to_b = load_a_to_b,
1565 load_b_to_a = load_b_to_a,
1566 ncbi_tax_id = ncbi_tax_id,
1567 )
1569 tbl = check_loaded()
1571 break
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:3208, in Mapper.load_mapping(self, resource, **kwargs)
3195 ncbi_tax_id = kwargs.get('ncbi_tax_id', resource.ncbi_tax_id)
3197 self._log(
3198 'Loading mapping table for organism `%s` '
3199 'with identifiers `%s` and `%s`, '
(...)
3205 )
3206 )
-> 3208 reader = MapReader(param = resource, **kwargs)
3210 a_to_b = reader.mapping_table_a_to_b
3211 b_to_a = reader.mapping_table_b_to_a
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:258, in MapReader.__init__(self, param, ncbi_tax_id, entity_type, load_a_to_b, load_b_to_a, uniprots, lifetime, resource_id_types)
255 self.uniprots = uniprots
256 self._resource_id_types = resource_id_types
--> 258 self.load()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:288, in MapReader.load(self)
283 self.read_cache()
285 if not self.tables_loaded():
286
287 # read from the original source
--> 288 self.read()
290 if self.tables_loaded():
291
292 # write cache only at successful loading
293 self.write_cache()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:450, in MapReader.read(self)
446 method = 'read_mapping_%s' % self.source_type
448 if hasattr(self, method):
--> 450 getattr(self, method)()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:893, in MapReader.read_mapping_uniprot(self)
891 protein_name = self.param.field == 'protein names'
892 query.name_process = not protein_name and not trembl
--> 893 data = query.perform()
895 if not query.name_process:
897 def maybe_split(v):
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\uniprot.py:681, in UniprotQuery.perform(self)
669 def perform(self) -> list[str] | dict[str, str] | dict[str, dict[str, str]]:
670 """
671 Perform the query and preprocess the result.
672
(...)
678 kind described in the previous point as values.
679 """
--> 681 _id, *variables = zip(*self)
682 _id = list(map(common.sfirst, _id))
684 if variables:
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\uniprot.py:660, in UniprotQuery.__iter__(self)
657 _proc0 = functools.partial(self._FIELDEND.sub, '')
658 _proc1 = self._FIELDSEP.split if self.name_process else common.identity
--> 660 for line in result:
662 line = line.strip('\n\r')
664 if line.strip():
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\share\curl.py:766, in FileOpener.iterfile(fileobj)
763 @staticmethod
764 def iterfile(fileobj):
--> 766 for line in fileobj:
768 yield line
File ~\miniconda3\envs\thomas\Lib\gzip.py:314, in GzipFile.read1(self, size)
312 if size < 0:
313 size = io.DEFAULT_BUFFER_SIZE
--> 314 return self._buffer.read1(size)
File ~\miniconda3\envs\thomas\Lib\_compression.py:68, in DecompressReader.readinto(self, b)
66 def readinto(self, b):
67 with memoryview(b) as view, view.cast("B") as byte_view:
---> 68 data = self.read(len(byte_view))
69 byte_view[:len(data)] = data
70 return len(data)
File ~\miniconda3\envs\thomas\Lib\gzip.py:490, in _GzipReader.read(self, size)
484 while True:
485 if self._decompressor.eof:
486 # Ending case: we've come to the end of a member in the file,
487 # so finish up this member, and read a new gzip header.
488 # Check the CRC and file size, and set the flag so we read
489 # a new member
--> 490 self._read_eof()
491 self._new_member = True
492 self._decompressor = self._decomp_factory(
493 **self._decomp_args)
File ~\miniconda3\envs\thomas\Lib\gzip.py:536, in _GzipReader._read_eof(self)
534 crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
535 if crc32 != self._crc:
--> 536 raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
537 hex(self._crc)))
538 elif isize != (self._stream_size & 0xffffffff):
539 raise BadGzipFile("Incorrect length of data produced")
BadGzipFile: CRC check failed 0x6036a815 != 0x17d2a1ca
Apparently a simple query to the basic UniProt API fails. We should check in the logs what that query was. In the directory where you ran your session above, the bottom of the last log file probably contains the relevant information:
ls -ltra ./pypath_log/ | tail
Or you can reproduce the error and see the log file from Python:
import decoupler as dc
import pypath
mpg = dc.get_progeny(organism = 'mouse')
# error happens
pypath.log()
In the log there should be the path to the affected cache file. You can check locally what the actual content of that file is: is it indeed a truncated gzip, or is it an HTML error page? And if you could post here the last few dozen lines from the log, it would be very helpful for me to find out about this error.
I ran it again, trying to reproduce the error. For some reason it now gives a different error:
https://omnipathdb.org/queries/enzsub?format=json
Downloading data from https://omnipathdb.org/queries/interactions?format=json
Downloading data from https://omnipathdb.org/queries/complexes?format=json
Downloading data from https://omnipathdb.org/queries/annotations?format=json
Downloading data from https://omnipathdb.org/queries/intercell?format=json
Downloading data from https://omnipathdb.org/about?format=text
Downloading annotations for all proteins from the following resources: ['PROGENy']
error Traceback (most recent call last) Cell In[5], line 1 ----> 1 progeny = dc.get_progeny(organism="mouse", top=500)
File ~\miniconda3\envs\thomas\Lib\site-packages\decoupler\omnip.py:250, in get_progeny(organism, top, genesymbol_resource, **kwargs) 248 p = p.drop('record_id', axis=1) 249 p.columns.name = None --> 250 p = _annotation_identifiers(p, organism, genesymbol_resource) 251 p = p[['pathway', 'genesymbol', 'weight', 'p_value']] 252 p = p[~p.duplicated(['pathway', 'genesymbol'])]
File ~\miniconda3\envs\thomas\Lib\site-packages\decoupler\omnip.py:919, in _annotation_identifiers(net, organism, genesymbol_resource) 906 def _annotation_identifiers( 907 net: pd.DataFrame, 908 organism: str | int, (...) 914 ) = None, 915 ) -> pd.DataFrame: 917 if not _is_human(organism): --> 919 net = translate_net( 920 net, 921 columns='uniprot', 922 id_type='uniprot', 923 source_organism=9606, 924 target_organism=organism, 925 ) 927 if genesymbol_resource is False: 929 net['genesymbol'] = net['uniprot']
File ~\miniconda3\envs\thomas\Lib\site-packages\decoupler\omnip.py:758, in translate_net(net, columns, source_organism, target_organism, id_type, unique_by, **kwargs) 755 hom_net = net.copy() 757 # Translate --> 758 hom_net = orthology.translate_df( 759 df=hom_net, 760 target=_target_organism, 761 cols=columns, 762 source=_source_organism, 763 ) 765 unique_by = common.to_list(unique_by) 767 if unique_by and all(c in hom_net.columns for c in unique_by): 768 769 # Remove duplicated based on source and target
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:2262, in translate_df(df, target, source, cols, id_type, only_swissprot, oma, homologene, ensembl, oma_rel_type, oma_score, ensembl_hc, ensembl_types, kwargs) 2259 args.pop('manager') 2260 args.pop('kwargs') -> 2262 return manager.translate_df(args, **kwargs)
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:620, in OrthologyManager.translate_df(self, df, target, source, cols, id_type, only_swissprot, oma, homologene, ensembl, oma_rel_type, oma_score, ensembl_hc, ensembl_types, kwargs) 618 args.pop('self') 619 args['id_type'] = _id_type --> 620 ortho_df = self.get_df(args) 622 table = self.which_table( 623 target = target, 624 source = source, (...) 627 resource = 'oma', 628 ) 630 df = table.translate_df( 631 df = df, 632 cols = [c for c, i in cols.items() if i == _id_type], 633 ortho_df = ortho_df, 634 )
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:515, in OrthologyManager.get_df(self, target, source, id_type, only_swissprot, oma, homologene, ensembl, oma_rel_type, oma_score, ensembl_hc, ensembl_types, full_records, **kwargs) 511 if not param[resource]: 513 continue --> 515 table = self.which_table( 516 target = target, 517 source = source, 518 only_swissprot = only_swissprot, 519 id_type = id_type, 520 resource = resource, 521 ) 523 result.append( 524 table.df( 525 full_records = full_records, (...) 531 ) 532 ) 534 return pd.concat(result)
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:233, in OrthologyManager.which_table(self, target, source, only_swissprot, resource, id_type) 229 self.expiry[key] = time.time() 231 if key not in self.tables: --> 233 self.load(key) 235 if key in self.tables: 237 return self.tables[key]
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:242, in OrthologyManager.load(self, key) 240 def load(self, key): --> 242 self.tables[key] = globals()[f'{key.resource.capitalize()}Orthology']( 243 target = key.target, 244 source = key.source, 245 only_swissprot = key.only_swissprot, 246 id_type = key.id_type, 247 )
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:1492, in OmaOrthology.init(self, target, source, id_type, only_swissprot, rel_type, score) 1462 def init( 1463 self, 1464 target: int | str, (...) 1472 score: float | None = None, 1473 ): 1474 """ 1475 Orthology translation with Ensembl data. 1476 (...) 1489 Lower threshold for similarity metric. 1490 """ -> 1492 ProteinOrthology.init(**locals())
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:869, in ProteinOrthology.init(self, target, source, id_type, only_swissprot, *kwargs) 867 self.load_proteome(self.source) 868 self._set_param(kwargs, self._param) --> 869 self.load()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\orthology.py:1506, in OmaOrthology.load(self) 1502 if self._from_pickle(): 1504 return -> 1506 oma_data = oma_input.oma_orthologs( 1507 organism_a = self.source, 1508 organism_b = self.target, 1509 id_type = self.id_type, 1510 ) 1511 self.data = collections.defaultdict(set) 1513 for rec in oma_data:
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\oma.py:125, in oma_orthologs(organism_a, organism_b, id_type, rel_type, score, return_df) 119 if ( 120 (score and rec['score'] < score) or 121 (rel_type and rec['rel_type'] not in reltype) 122 ): 123 continue --> 125 a, b = ( 126 [ 127 OmaGene( 128 id = id, 129 oma_group = e['oma_group'], 130 hog = e['oma_hog_id'], 131 taxon = e['species']['taxon_id'], 132 chr = e['chromosome'], 133 start = int(e['locus']['start']), 134 end = int(e['locus']['end']), 135 strand = int(e['locus']['strand']), 136 main_isoform = e['is_mainisoform'], 137 138 ) 139 for id in _idtranslate( 140 id = e['canonicalid'], 141 taxon = e['species']['taxon_id'], 142 id_type = idtype, 143 ) 144 ] 145 for e in (rec[f'entry{ei}'] for ei in (1, 2)) 146 ) 149 result.update( 150 { 151 OmaOrthology( (...) 159 } 160 ) 162 if page > n_pages: break
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\oma.py:139, in
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\oma.py:244, in _idtranslate(id, taxon, id_type) 234 if not idtype: return {id} 236 s_idtype = ( 237 'ensg' 238 if id.startswith('ENS') else (...) 241 'uniprot' 242 ) --> 244 uniprots = mapping.mapname( 245 id, 246 s_id_type, 247 'uniprot', 248 ncbi_tax_id = taxon, 249 ) 251 return mapping.map_names( 252 uniprots, 253 'uniprot', 254 id_type, 255 ncbi_tax_id = taxon, 256 ) if uniprots else set()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:3551, in map_name(name, id_type, target_id_type, ncbi_tax_id, strict, expand_complexes, uniprot_cleanup)
3498 """
3499 Translates one instance of one ID type to a different one.
3500 Returns set of the target ID type.
(...)
3546 ID, call the uniprot_cleanup
function at the end.
3547 """
3549 mapper = get_mapper()
-> 3551 return mapper.map_name(
3552 name = name,
3553 id_type = id_type,
3554 target_id_type = target_id_type,
3555 ncbi_tax_id = ncbi_tax_id,
3556 strict = strict,
3557 expand_complexes = expand_complexes,
3558 uniprot_cleanup = uniprot_cleanup,
3559 )
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath_common_misc.py:2953, in ignore_unhashable.
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:2193, in Mapper.map_name(self, name, id_type, target_id_type, ncbi_tax_id, strict, expand_complexes, uniprot_cleanup) 2189 # for UniProt IDs we do a few more steps to 2190 # try to find out the primary SwissProt ID 2191 if uniprot_cleanup and target_id_type == 'uniprot': -> 2193 mapped_names = self.uniprot_cleanup( 2194 uniprots = mapped_names, 2195 ncbi_tax_id = ncbi_tax_id, 2196 ) 2198 return mapped_names
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:2227, in Mapper.uniprot_cleanup(self, uniprots, ncbi_tax_id) 2224 # step 2: translate TrEMBL to SwissProt by gene symbols 2225 if self._trembl_swissprot_by_genesymbol: -> 2227 uniprots = self.trembl_swissprot( 2228 uniprots, 2229 ncbi_tax_id = ncbi_tax_id, 2230 ) 2232 # step 3: translate deleted IDs by gene symbols 2233 if self._translate_deleted_uniprot:
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:2868, in Mapper.trembl_swissprot(self, uniprots, ncbi_tax_id) 2865 for uniprot in uniprots: 2867 swissprot = None -> 2868 genesymbols = self.map_name( 2869 name = uniprot, 2870 id_type = 'trembl', 2871 target_id_type = 'genesymbol', 2872 ncbi_tax_id = ncbi_tax_id, 2873 ) 2875 this_swissprots = self.map_names( 2876 names = genesymbols, 2877 id_type = 'genesymbol', 2878 target_id_type = 'swissprot', 2879 ncbi_tax_id = ncbi_tax_id, 2880 ) 2882 if not this_swissprots:
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath_common_misc.py:2953, in ignore_unhashable.
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:1978, in Mapper.map_name(self, name, id_type, target_id_type, ncbi_tax_id, strict, expand_complexes, uniprot_cleanup) 1964 mapped_names = self.chain_map( 1965 name = name, 1966 id_type = id_type, (...) 1972 uniprot_cleanup = uniprot_cleanup, 1973 ) 1975 else: 1976 1977 # all the other ID types -> 1978 mapped_names = self._map_name( 1979 name = name, 1980 id_type = id_type, 1981 target_id_type = target_id_type, 1982 ncbi_tax_id = ncbi_tax_id, 1983 ) 1985 # as ID translation tables for PRO IDs are not organism specific 1986 # we need an extra step to limit the results to the target organism 1987 if id_type == 'pro' and target_id_type == 'uniprot':
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:2510, in Mapper._map_name(self, name, id_type, target_id_type, ncbi_tax_id) 2503 """ 2504 Once we have defined the name type and the target name type, 2505 this function looks it up in the most suitable dictionary. 2506 """ 2508 ncbi_tax_id = ncbi_tax_id or self.ncbi_tax_id -> 2510 tbl = self.which_table( 2511 id_type, 2512 target_id_type, 2513 ncbi_tax_id = ncbi_tax_id, 2514 ) 2516 return tbl[name] if tbl else set()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:1562, in Mapper.which_table(self, id_type, target_id_type, load, ncbi_tax_id) 1551 if resource: 1553 self._log( 1554 'Chosen built-in defined ID translation table: ' 1555 'resource=%s, id_type_a=%s, id_type_b=%s' % ( (...) 1559 ) 1560 ) -> 1562 self.load_mapping( 1563 resource = resource, 1564 load_a_to_b = load_a_to_b, 1565 load_b_to_a = load_b_to_a, 1566 ncbi_tax_id = ncbi_tax_id, 1567 ) 1569 tbl = check_loaded() 1571 break
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:3208, in Mapper.load_mapping(self, resource, kwargs)
3195 ncbi_tax_id = kwargs.get('ncbi_tax_id', resource.ncbi_tax_id)
3197 self._log(
3198 'Loading mapping table for organism %s
'
3199 'with identifiers %s
and %s
, '
(...)
3205 )
3206 )
-> 3208 reader = MapReader(param = resource, kwargs)
3210 a_to_b = reader.mapping_table_a_to_b
3211 b_to_a = reader.mapping_table_b_to_a
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:258, in MapReader.init(self, param, ncbi_tax_id, entity_type, load_a_to_b, load_b_to_a, uniprots, lifetime, resource_id_types) 255 self.uniprots = uniprots 256 self._resource_id_types = resource_id_types --> 258 self.load()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:288, in MapReader.load(self) 283 self.read_cache() 285 if not self.tables_loaded(): 286 287 # read from the original source --> 288 self.read() 290 if self.tables_loaded(): 291 292 # write cache only at successful loading 293 self.write_cache()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:450, in MapReader.read(self) 446 method = 'readmapping%s' % self.source_type 448 if hasattr(self, method): --> 450 getattr(self, method)()
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\utils\mapping.py:893, in MapReader.read_mapping_uniprot(self) 891 protein_name = self.param.field == 'protein names' 892 query.name_process = not protein_name and not trembl --> 893 data = query.perform() 895 if not query.name_process: 897 def maybe_split(v):
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\uniprot.py:681, in UniprotQuery.perform(self) 669 def perform(self) -> list[str] | dict[str, str] | dict[str, dict[str, str]]: 670 """ 671 Perform the query and preprocess the result. 672 (...) 678 kind described in the previous point as values. 679 """ --> 681 _id, variables = zip(self) 682 _id = list(map(common.sfirst, _id)) 684 if variables:
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\inputs\uniprot.py:656, in UniprotQuery.iter(self) 648 c = curl.Curl( 649 self._baseurl, 650 get = self._get, (...) 653 compr = 'gz', 654 ) 655 result = c.result if c.result or self.fail_onempty else [0].iter() --> 656 = next(result) 657 _proc0 = functools.partial(self._FIELDEND.sub, '') 658 _proc1 = self._FIELDSEP.split if self.name_process else common.identity
File ~\miniconda3\envs\thomas\Lib\site-packages\pypath\share\curl.py:766, in FileOpener.iterfile(fileobj) 763 @staticmethod 764 def iterfile(fileobj): --> 766 for line in fileobj: 768 yield line
File ~\miniconda3\envs\thomas\Lib\gzip.py:314, in GzipFile.read1(self, size) 312 if size < 0: 313 size = io.DEFAULT_BUFFER_SIZE --> 314 return self._buffer.read1(size)
File ~\miniconda3\envs\thomas\Lib_compression.py:68, in DecompressReader.readinto(self, b) 66 def readinto(self, b): 67 with memoryview(b) as view, view.cast("B") as byte_view: ---> 68 data = self.read(len(byte_view)) 69 byte_view[:len(data)] = data 70 return len(data)
File ~\miniconda3\envs\thomas\Lib\gzip.py:507, in _GzipReader.read(self, size) 504 # Read a chunk of data from the file 505 buf = self._fp.read(io.DEFAULT_BUFFER_SIZE) --> 507 uncompress = self._decompressor.decompress(buf, size) 508 if self._decompressor.unconsumed_tail != b"": 509 self._fp.prepend(self._decompressor.unconsumed_tail)
error: Error -3 while decompressing data: invalid block type
Describe the bug When using any other organism than 'human', a BadGzipFile error occurs. I tried using different species (mouse, rat, pig, dog,...) and different names and IDs (e.g. mouse, mus musculus, NCBI ID '10090' for mouse).
To Reproduce import decoupler as dc progeny = dc.get_progeny(organism="mus musculus", top=500)
Expected behavior Return of a DataFrame as noted in the documentation of decoupler.get_progeny
System