sorgerlab / indra

INDRA (Integrated Network and Dynamical Reasoning Assembler) is an automated model assembly system interfacing with NLP systems and databases to collect knowledge, and through a process of assembly, produce causal graphs and dynamical models.
http://indra.bio
BSD 2-Clause "Simplified" License
171 stars 65 forks source link

CausalBioNet processing - KeyError while processing jgif file #1420

Closed kkaris closed 9 months ago

kkaris commented 11 months ago

Processing the CausalBioNet Human-2.0 JGIF files raises the following KeyError from the selventa_lookup dictionary:

KeyError: ('SCHEM', 'nano-sized particulate matter (nPM)')

When running this code:

import requests
from zipfile import ZipFile
from indra.sources.bel.api import process_cbn_jgif_file
import tempfile
import os

# Reproduction script: download the CausalBioNet Human-2.0 archive, unpack it
# into a scratch directory and run every JGIF file through
# process_cbn_jgif_file, collecting the resulting statements in `stmts`.
archive_url = 'https://www.causalbionet.com/Content/jgf_bulk_files/Human-2.0.zip'

# Scratch directory (created with the suffix 'cbn_manager') that holds both
# the downloaded archive and its extracted contents.
cbn_dir = tempfile.mkdtemp('cbn_manager')
tmp_zip = os.path.join(cbn_dir, 'cbn_human.zip')

# Fetch the archive in a single request and write the raw bytes to disk.
resp = requests.get(archive_url)
with open(tmp_zip, 'wb') as zip_out:
    zip_out.write(resp.content)

# Extract into a dedicated sub-directory of the scratch directory.
tmp_dir = os.path.join(cbn_dir, 'cbn')
os.mkdir(tmp_dir)

stmts = []
with ZipFile(tmp_zip) as zipf:
    zipf.extractall(path=tmp_dir)
    for jgif in zipf.namelist():
        # Skip archive members that are not JGIF files.
        if not jgif.endswith(('.jgf', '.jgif')):
            continue
        pbp = process_cbn_jgif_file(os.path.join(tmp_dir, jgif))
        stmts.extend(pbp.statements)

The full error output:

File ~/repos/indra/indra/sources/bel/api.py:270, in process_cbn_jgif_file(file_name)
    256 """Return a PybelProcessor by processing a CBN JGIF JSON file.
    257 
    258 Parameters
   (...)
    267     bp.statements.
    268 """
    269 with open(file_name, 'r') as jgf:
--> 270     return process_pybel_graph(pybel.from_cbn_jgif(json.load(jgf)))

File ~/repos/indra/indra/sources/bel/api.py:197, in process_pybel_graph(graph)
    183 """Return a PybelProcessor by processing a PyBEL graph.
    184 
    185 Parameters
   (...)
    194     bp.statements.
    195 """
    196 bp = PybelProcessor(graph)
--> 197 bp.get_statements()
    198 if bp.annot_manager.failures:
    199     logger.warning('missing %d annotation pairs',
    200                    sum(len(v)
    201                        for v in bp.annot_manager.failures.values()))

File ~/repos/indra/indra/sources/bel/processor.py:156, in PybelProcessor.get_statements(self)
    149         self._get_gef_gap(u_data, v_data, k, d)
    150     # Activation/Inhibition
    151     #   x(Foo) -> act(x(Foo))
    152     #   act(x(Foo)) -> act(x(Foo))
    153     # GtpActivation
    154     #   gtp(p(Foo)) => act(p(Foo))
    155     else:
--> 156         self._get_regulate_activity(u_data, v_data, k, d)
    157 # Activations involving biological processes or pathologies
    158 #   x(Foo) -> bp(Bar)
    159 elif isinstance(v_data, (dsl.BiologicalProcess, dsl.Pathology)):

File ~/repos/indra/indra/sources/bel/processor.py:251, in PybelProcessor._get_regulate_activity(self, u_data, v_data, k, edge_data)
    249 def _get_regulate_activity(self, u_data, v_data, k, edge_data):
    250     # Subject info
--> 251     subj_agent = get_agent(u_data, edge_data.get(pc.SOURCE_MODIFIER))
    252     subj_activity = _get_activity_condition(edge_data.get(pc.SOURCE_MODIFIER))
    253     # Object info
    254     # Note: Don't pass the object modifier data because we don't want to
    255     # put an activity on the agent

File ~/repos/indra/indra/sources/bel/processor.py:432, in get_agent(node_data, node_modifier_data)
    430 if not ident:
    431     assert name, "Node must have a name if lacking an identifier."
--> 432     name, db_refs = get_db_refs_by_name(ns, name, node_data)
    433 # We've already got an identifier, look up other identifiers if necessary
    434 else:
    435     name, db_refs = get_db_refs_by_ident(ns, ident, node_data)

File ~/repos/indra/indra/sources/bel/processor.py:617, in get_db_refs_by_name(ns, name, node_data)
    615 # SDIS, SCHEM: Look up the ID and include it in the db_refs
    616 elif ns in {'SDIS', 'SCHEM'}:
--> 617     sid, xrefs = selventa_lookup[(ns, name)]
    618     db_refs = xrefs.copy()
    619     db_refs[ns] = sid

KeyError: ('SCHEM', 'nano-sized particulate matter (nPM)')
kkaris commented 11 months ago

Catching the above error leads to a second KeyError from the same selventa_lookup dictionary, this time for an SFAM entry:

File ~/repos/indra_db/indra_db/cli/knowledgebase.py:189, in CBNManager.get_statements(self)
    187     for jgif in tqdm(zipf.namelist()):
    188         if jgif.endswith('.jgf') or jgif.endswith('.jgif'):
--> 189             pbp = process_cbn_jgif_file(os.path.join(tmp_dir, jgif))
    190             stmts += pbp.statements
    192 uniques, dups = extract_duplicates(stmts,
    193                                    key_func=KeyFunc.mk_and_one_ev_src)

File ~/repos/indra/indra/sources/bel/api.py:270, in process_cbn_jgif_file(file_name)
    256 """Return a PybelProcessor by processing a CBN JGIF JSON file.
    257 
    258 Parameters
   (...)
    267     bp.statements.
    268 """
    269 with open(file_name, 'r') as jgf:
--> 270     return process_pybel_graph(pybel.from_cbn_jgif(json.load(jgf)))

File ~/repos/indra/indra/sources/bel/api.py:197, in process_pybel_graph(graph)
    183 """Return a PybelProcessor by processing a PyBEL graph.
    184 
    185 Parameters
   (...)
    194     bp.statements.
    195 """
    196 bp = PybelProcessor(graph)
--> 197 bp.get_statements()
    198 if bp.annot_manager.failures:
    199     logger.warning('missing %d annotation pairs',
    200                    sum(len(v)
    201                        for v in bp.annot_manager.failures.values()))

File ~/repos/indra/indra/sources/bel/processor.py:156, in PybelProcessor.get_statements(self)
    149         self._get_gef_gap(u_data, v_data, k, d)
    150     # Activation/Inhibition
    151     #   x(Foo) -> act(x(Foo))
    152     #   act(x(Foo)) -> act(x(Foo))
    153     # GtpActivation
    154     #   gtp(p(Foo)) => act(p(Foo))
    155     else:
--> 156         self._get_regulate_activity(u_data, v_data, k, d)
    157 # Activations involving biological processes or pathologies
    158 #   x(Foo) -> bp(Bar)
    159 elif isinstance(v_data, (dsl.BiologicalProcess, dsl.Pathology)):

File ~/repos/indra/indra/sources/bel/processor.py:256, in PybelProcessor._get_regulate_activity(self, u_data, v_data, k, edge_data)
    252 subj_activity = _get_activity_condition(edge_data.get(pc.SOURCE_MODIFIER))
    253 # Object info
    254 # Note: Don't pass the object modifier data because we don't want to
    255 # put an activity on the agent
--> 256 obj_agent = get_agent(v_data, None)
    257 # If it's a bioprocess object, we won't have an activity in the edge
    258 if isinstance(v_data, (dsl.BiologicalProcess, dsl.Pathology)):

File ~/repos/indra/indra/sources/bel/processor.py:432, in get_agent(node_data, node_modifier_data)
    430 if not ident:
    431     assert name, "Node must have a name if lacking an identifier."
--> 432     name, db_refs = get_db_refs_by_name(ns, name, node_data)
    433 # We've already got an identifier, look up other identifiers if necessary
    434 else:
    435     name, db_refs = get_db_refs_by_ident(ns, ident, node_data)

File ~/repos/indra/indra/sources/bel/processor.py:550, in get_db_refs_by_name(ns, name, node_data)
    548 # Map Selventa families and complexes to FamPlex
    549 elif ns == 'SFAM':
--> 550     sfam_id, xrefs = selventa_lookup[('SFAM', name)]
    551     db_refs = {'SFAM': sfam_id}
    552     indra_name = bel_to_indra.get(name)

KeyError: ('SFAM', 'p-nitrophenol hydroxylase')
kkaris commented 11 months ago

Catching the KeyErrors and setting db_refs to None resolved the errors: https://github.com/sorgerlab/indra/pull/1423/commits/e9ab61aa9edfacb94416f557a3c53c3ee24dde6a

kkaris commented 9 months ago

Will be closed when #1423 is merged.