saezlab / decoupler-py

Python package to perform enrichment analysis from omics data.
https://decoupler-py.readthedocs.io/
GNU General Public License v3.0
168 stars 25 forks source link

Error from Importing MSigDB and progeny of mouse organism (and translate_net) #155

Closed wgsim closed 1 month ago

wgsim commented 2 months ago

Describe the bug A clear and concise description of what the bug is.

Hi, when I used "dc.get_resource('MSigDB', organism='mouse')" or "dc.translate_net(target_organism = 'mouse')", I got this error: "error: Error -3 while decompressing data: too many length or distance symbols". I also tried clearing my cache with "rm ~/.cache/omnipathdb/*", but that didn't solve the issue.

Also, when I used "dc.get_progeny(organism = 'mouse', top = 500)", it returned the same error. However, "dc.get_collectri(organism = 'mouse', split_complexes = False)" worked fine.

Please give me any advice, thanks!

To Reproduce Steps to reproduce the behavior.

Please provide exact steps to reproduce the bug in a clean Python environment. In case it's not clear what's causing this bug, please provide the data or the data generation procedure. Sometimes it is not possible to share the data but usually it is possible to replicate problems on publicly available datasets or to share a subset of your data.

Expected behavior A clear and concise description of what you expected to happen.

System

Additional context Add any other context about the problem here.

wgsim commented 1 month ago

Hi @deeenes, I opened a new issue for my problem, and this is my traceback.


error Traceback (most recent call last) Cell In[26], line 6 2 import logging 4 logging.getLogger('omnipath').setLevel('DEBUG') ----> 6 mm_msigdb = dc.get_resource('MSigDB', organism='mouse')

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/decoupler/omnip.py:360, in get_resource(name, organism, genesymbol_resource, **kwargs) 358 df.columns = list(df.columns) 359 df = df.reset_index() --> 360 df = _annotation_identifiers(df, organism, genesymbol_resource) 361 df = df.drop(columns=['record_id', 'uniprot', 'entity_type', 'source']) 362 df = op._misc.dtypes.auto_dtype(df)

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/decoupler/omnip.py:919, in _annotation_identifiers(net, organism, genesymbol_resource) 906 def _annotation_identifiers( 907 net: pd.DataFrame, 908 organism: str | int, (...) 914 ) = None, 915 ) -> pd.DataFrame: 917 if not _is_human(organism): --> 919 net = translate_net( 920 net, 921 columns='uniprot', 922 id_type='uniprot', 923 source_organism=9606, 924 target_organism=organism, 925 ) 927 if genesymbol_resource is False: 929 net['genesymbol'] = net['uniprot']

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/decoupler/omnip.py:758, in translate_net(net, columns, source_organism, target_organism, id_type, unique_by, **kwargs) 755 hom_net = net.copy() 757 # Translate --> 758 hom_net = orthology.translate_df( 759 df=hom_net, 760 target=_target_organism, 761 cols=columns, 762 source=_source_organism, 763 ) 765 unique_by = common.to_list(unique_by) 767 if unique_by and all(c in hom_net.columns for c in unique_by): 768 769 # Remove duplicated based on source and target

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/orthology.py:2262, in translate_df(df, target, source, cols, id_type, only_swissprot, oma, homologene, ensembl, oma_rel_type, oma_score, ensembl_hc, ensembl_types, kwargs) 2259 args.pop('manager') 2260 args.pop('kwargs') -> 2262 return manager.translate_df(args, **kwargs)

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/orthology.py:620, in OrthologyManager.translate_df(self, df, target, source, cols, id_type, only_swissprot, oma, homologene, ensembl, oma_rel_type, oma_score, ensembl_hc, ensembl_types, kwargs) 618 args.pop('self') 619 args['id_type'] = _id_type --> 620 ortho_df = self.get_df(args) 622 table = self.which_table( 623 target = target, 624 source = source, (...) 627 resource = 'oma', 628 ) 630 df = table.translate_df( 631 df = df, 632 cols = [c for c, i in cols.items() if i == _id_type], 633 ortho_df = ortho_df, 634 )

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/orthology.py:515, in OrthologyManager.get_df(self, target, source, id_type, only_swissprot, oma, homologene, ensembl, oma_rel_type, oma_score, ensembl_hc, ensembl_types, full_records, **kwargs) 511 if not param[resource]: 513 continue --> 515 table = self.which_table( 516 target = target, 517 source = source, 518 only_swissprot = only_swissprot, 519 id_type = id_type, 520 resource = resource, 521 ) 523 result.append( 524 table.df( 525 full_records = full_records, (...) 531 ) 532 ) 534 return pd.concat(result)

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/orthology.py:233, in OrthologyManager.which_table(self, target, source, only_swissprot, resource, id_type) 229 self.expiry[key] = time.time() 231 if key not in self.tables: --> 233 self.load(key) 235 if key in self.tables: 237 return self.tables[key]

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/orthology.py:242, in OrthologyManager.load(self, key) 240 def load(self, key): --> 242 self.tables[key] = globals()[f'{key.resource.capitalize()}Orthology']( 243 target = key.target, 244 source = key.source, 245 only_swissprot = key.only_swissprot, 246 id_type = key.id_type, 247 )

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/orthology.py:1492, in OmaOrthology.init(self, target, source, id_type, only_swissprot, rel_type, score) 1462 def init( 1463 self, 1464 target: int | str, (...) 1472 score: float | None = None, 1473 ): 1474 """ 1475 Orthology translation with Ensembl data. 1476 (...) 1489 Lower threshold for similarity metric. 1490 """ -> 1492 ProteinOrthology.init(**locals())

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/orthology.py:869, in ProteinOrthology.init(self, target, source, id_type, only_swissprot, *kwargs) 867 self.load_proteome(self.source) 868 self._set_param(kwargs, self._param) --> 869 self.load()

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/orthology.py:1506, in OmaOrthology.load(self) 1502 if self._from_pickle(): 1504 return -> 1506 oma_data = oma_input.oma_orthologs( 1507 organism_a = self.source, 1508 organism_b = self.target, 1509 id_type = self.id_type, 1510 ) 1511 self.data = collections.defaultdict(set) 1513 for rec in oma_data:

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/inputs/oma.py:125, in oma_orthologs(organism_a, organism_b, id_type, rel_type, score, return_df) 119 if ( 120 (score and rec['score'] < score) or 121 (rel_type and rec['rel_type'] not in reltype) 122 ): 123 continue --> 125 a, b = ( 126 [ 127 OmaGene( 128 id = id, 129 oma_group = e['oma_group'], 130 hog = e['oma_hog_id'], 131 taxon = e['species']['taxon_id'], 132 chr = e['chromosome'], 133 start = int(e['locus']['start']), 134 end = int(e['locus']['end']), 135 strand = int(e['locus']['strand']), 136 main_isoform = e['is_mainisoform'], 137 138 ) 139 for id in _idtranslate( 140 id = e['canonicalid'], 141 taxon = e['species']['taxon_id'], 142 id_type = idtype, 143 ) 144 ] 145 for e in (rec[f'entry{ei}'] for ei in (1, 2)) 146 ) 149 result.update( 150 { 151 OmaOrthology( (...) 159 } 160 ) 162 if page > n_pages: break

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/inputs/oma.py:139, in (.0) 119 if ( 120 (score and rec['score'] < score) or 121 (rel_type and rec['rel_type'] not in reltype) 122 ): 123 continue 125 a, b = ( 126 [ 127 OmaGene( 128 id = id, 129 oma_group = e['oma_group'], 130 hog = e['oma_hog_id'], 131 taxon = e['species']['taxon_id'], 132 chr = e['chromosome'], 133 start = int(e['locus']['start']), 134 end = int(e['locus']['end']), 135 strand = int(e['locus']['strand']), 136 main_isoform = e['is_mainisoform'], 137 138 ) --> 139 for id in _idtranslate( 140 id = e['canonicalid'], 141 taxon = e['species']['taxon_id'], 142 id_type = idtype, 143 ) 144 ] 145 for e in (rec[f'entry{ei}'] for ei in (1, 2)) 146 ) 149 result.update( 150 { 151 OmaOrthology( (...) 159 } 160 ) 162 if page > n_pages: break

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/inputs/oma.py:244, in _idtranslate(id, taxon, id_type) 234 if not idtype: return {id} 236 s_idtype = ( 237 'ensg' 238 if id.startswith('ENS') else (...) 241 'uniprot' 242 ) --> 244 uniprots = mapping.mapname( 245 id, 246 s_id_type, 247 'uniprot', 248 ncbi_tax_id = taxon, 249 ) 251 return mapping.map_names( 252 uniprots, 253 'uniprot', 254 id_type, 255 ncbi_tax_id = taxon, 256 ) if uniprots else set()

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/mapping.py:3551, in map_name(name, id_type, target_id_type, ncbi_tax_id, strict, expand_complexes, uniprot_cleanup) 3498 """ 3499 Translates one instance of one ID type to a different one. 3500 Returns set of the target ID type. (...) 3546 ID, call the uniprot_cleanup function at the end. 3547 """ 3549 mapper = get_mapper() -> 3551 return mapper.map_name( 3552 name = name, 3553 id_type = id_type, 3554 target_id_type = target_id_type, 3555 ncbi_tax_id = ncbi_tax_id, 3556 strict = strict, 3557 expand_complexes = expand_complexes, 3558 uniprot_cleanup = uniprot_cleanup, 3559 )

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath_common/_misc.py:2936, in ignore_unhashable..wrapper(*args, kwargs) 2933 @functools.wraps(func, assigned = attributes) 2934 def wrapper(*args, *kwargs): 2935 try: -> 2936 return func(args, kwargs) 2937 except TypeError as error: 2938 if 'unhashable type' in str(error):

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/mapping.py:1978, in Mapper.map_name(self, name, id_type, target_id_type, ncbi_tax_id, strict, expand_complexes, uniprot_cleanup) 1964 mapped_names = self.chain_map( 1965 name = name, 1966 id_type = id_type, (...) 1972 uniprot_cleanup = uniprot_cleanup, 1973 ) 1975 else: 1976 1977 # all the other ID types -> 1978 mapped_names = self._map_name( 1979 name = name, 1980 id_type = id_type, 1981 target_id_type = target_id_type, 1982 ncbi_tax_id = ncbi_tax_id, 1983 ) 1985 # as ID translation tables for PRO IDs are not organism specific 1986 # we need an extra step to limit the results to the target organism 1987 if id_type == 'pro' and target_id_type == 'uniprot':

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/mapping.py:2510, in Mapper._map_name(self, name, id_type, target_id_type, ncbi_tax_id) 2503 """ 2504 Once we have defined the name type and the target name type, 2505 this function looks it up in the most suitable dictionary. 2506 """ 2508 ncbi_tax_id = ncbi_tax_id or self.ncbi_tax_id -> 2510 tbl = self.which_table( 2511 id_type, 2512 target_id_type, 2513 ncbi_tax_id = ncbi_tax_id, 2514 ) 2516 return tbl[name] if tbl else set()

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/mapping.py:1562, in Mapper.which_table(self, id_type, target_id_type, load, ncbi_tax_id) 1551 if resource: 1553 self._log( 1554 'Chosen built-in defined ID translation table: ' 1555 'resource=%s, id_type_a=%s, id_type_b=%s' % ( (...) 1559 ) 1560 ) -> 1562 self.load_mapping( 1563 resource = resource, 1564 load_a_to_b = load_a_to_b, 1565 load_b_to_a = load_b_to_a, 1566 ncbi_tax_id = ncbi_tax_id, 1567 ) 1569 tbl = check_loaded() 1571 break

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/mapping.py:3208, in Mapper.load_mapping(self, resource, kwargs) 3195 ncbi_tax_id = kwargs.get('ncbi_tax_id', resource.ncbi_tax_id) 3197 self._log( 3198 'Loading mapping table for organism %s ' 3199 'with identifiers %s and %s, ' (...) 3205 ) 3206 ) -> 3208 reader = MapReader(param = resource, kwargs) 3210 a_to_b = reader.mapping_table_a_to_b 3211 b_to_a = reader.mapping_table_b_to_a

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/mapping.py:258, in MapReader.init(self, param, ncbi_tax_id, entity_type, load_a_to_b, load_b_to_a, uniprots, lifetime, resource_id_types) 255 self.uniprots = uniprots 256 self._resource_id_types = resource_id_types --> 258 self.load()

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/mapping.py:288, in MapReader.load(self) 283 self.read_cache() 285 if not self.tables_loaded(): 286 287 # read from the original source --> 288 self.read() 290 if self.tables_loaded(): 291 292 # write cache only at successful loading 293 self.write_cache()

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/mapping.py:450, in MapReader.read(self) 446 method = 'readmapping%s' % self.source_type 448 if hasattr(self, method): --> 450 getattr(self, method)()

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/utils/mapping.py:893, in MapReader.read_mapping_uniprot(self) 891 protein_name = self.param.field == 'protein names' 892 query.name_process = not protein_name and not trembl --> 893 data = query.perform() 895 if not query.name_process: 897 def maybe_split(v):

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/inputs/uniprot.py:681, in UniprotQuery.perform(self) 669 def perform(self) -> list[str] | dict[str, str] | dict[str, dict[str, str]]: 670 """ 671 Perform the query and preprocess the result. 672 (...) 678 kind described in the previous point as values. 679 """ --> 681 _id, variables = zip(self) 682 _id = list(map(common.sfirst, _id)) 684 if variables:

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/inputs/uniprot.py:660, in UniprotQuery.iter(self) 657 _proc0 = functools.partial(self._FIELDEND.sub, '') 658 _proc1 = self._FIELDSEP.split if self.name_process else common.identity --> 660 for line in result: 662 line = line.strip('\n\r') 664 if line.strip():

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/site-packages/pypath/share/curl.py:766, in FileOpener.iterfile(fileobj) 763 @staticmethod 764 def iterfile(fileobj): --> 766 for line in fileobj: 768 yield line

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/gzip.py:313, in GzipFile.read1(self, size) 311 if size < 0: 312 size = io.DEFAULT_BUFFER_SIZE --> 313 return self._buffer.read1(size)

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/_compression.py:68, in DecompressReader.readinto(self, b) 66 def readinto(self, b): 67 with memoryview(b) as view, view.cast("B") as byte_view: ---> 68 data = self.read(len(byte_view)) 69 byte_view[:len(data)] = data 70 return len(data)

File /opt/homebrew/Caskroom/miniconda/base/envs/decoupler_env/lib/python3.9/gzip.py:495, in _GzipReader.read(self, size) 492 # Read a chunk of data from the file 493 buf = self._fp.read(io.DEFAULT_BUFFER_SIZE) --> 495 uncompress = self._decompressor.decompress(buf, size) 496 if self._decompressor.unconsumed_tail != b"": 497 self._fp.prepend(self._decompressor.unconsumed_tail)

error: Error -3 while decompressing data: too many length or distance symbols

Thank you!!

wgsim commented 1 month ago

Hi @deeenes, I found out how to solve my issue. I needed to clear both the omnipathdb cache and the pypath cache; the pypath cache directory can be found with "from pypath.share import cache; cache.get_cachedir()". Sorry to bother you, and thank you!