compartia / nlp_tools

0 stars 0 forks source link

Errors during batch contract parsing #48

Open compartia opened 5 years ago

compartia commented 5 years ago

See dev: "Parse contracts and save all to JSONs.ipynb", branch: semantic-tags

/content/gdrive/My Drive/GazpromOil/Contracts/Проект договора ООО Чудеса.doc
zero-size array to reduction operation maximum which has no identity
/content/gdrive/My Drive/GazpromOil/Contracts/Доп.согл3 Доверие.doc
zero-size array to reduction operation maximum which has no identity
nlp_tools/integration/doc_providers.py:29: RuntimeWarning: Cannot read file /content/gdrive/My Drive/GazpromOil/Contracts/Проект договора ООО Чудеса.doc with docx2txt, error: (<class 'KeyError'>, KeyError("There is no item named 'word/document.xml' in the archive",), <traceback object at 0x7f88aee33208>)
  warnings.warn(info_, RuntimeWarning)
nlp_tools/integration/doc_providers.py:29: RuntimeWarning: Cannot read file /content/gdrive/My Drive/GazpromOil/Contracts/Доп.согл3 Доверие.doc with docx2txt, error: (<class 'KeyError'>, KeyError("There is no item named 'word/document.xml' in the archive",), <traceback object at 0x7f88aee33188>)
  warnings.warn(info_, RuntimeWarning)
nlp_tools/integration/doc_providers.py:29: RuntimeWarning: Cannot read file /content/gdrive/My Drive/GazpromOil/Contracts/Проект договора ООО Зеленые.doc with docx2txt, error: (<class 'KeyError'>, KeyError("There is no item named 'word/document.xml' in the archive",), <traceback object at 0x7f88aee33188>)
  warnings.warn(info_, RuntimeWarning)
/content/gdrive/My Drive/GazpromOil/Contracts/Проект договора ООО Зеленые.doc
zero-size array to reduction operation maximum which has no identity
nlp_tools/integration/doc_providers.py:29: RuntimeWarning: Cannot read file /content/gdrive/My Drive/GazpromOil/Contracts/Договор купли-продажи_ООО Парацельс.doc with docx2txt, error: (<class 'KeyError'>, KeyError("There is no item named 'word/document.xml' in the archive",), <traceback object at 0x7f88aee33308>)
  warnings.warn(info_, RuntimeWarning)
/content/gdrive/My Drive/GazpromOil/Contracts/Договор купли-продажи_ООО Парацельс.doc
zero-size array to reduction operation maximum which has no identity
nlp_tools/integration/doc_providers.py:29: RuntimeWarning: Cannot read file /content/gdrive/My Drive/GazpromOil/Contracts/Договор хранения нефти_ХранТрансНефть.doc with docx2txt, error: (<class 'KeyError'>, KeyError("There is no item named 'word/document.xml' in the archive",), <traceback object at 0x7f88aee33108>)
  warnings.warn(info_, RuntimeWarning)
/content/gdrive/My Drive/GazpromOil/Contracts/Договор хранения нефти_ХранТрансНефть.doc
zero-size array to reduction operation maximum which has no identity
nlp_tools/integration/doc_providers.py:29: RuntimeWarning: Cannot read file /content/gdrive/My Drive/GazpromOil/Contracts/7. Соглашение БГ Газпромнефть-Региональные продажи.doc with docx2txt, error: (<class 'zipfile.BadZipFile'>, BadZipFile('File is not a zip file',), <traceback object at 0x7f88aee33388>)
  warnings.warn(info_, RuntimeWarning)
/content/gdrive/My Drive/GazpromOil/Contracts/7. Соглашение БГ Газпромнефть-Региональные продажи.doc
zero-size array to reduction operation maximum which has no identity
nlp_tools/integration/doc_providers.py:29: RuntimeWarning: Cannot read file /content/gdrive/My Drive/GazpromOil/Contracts/Купля-продажа.doc with docx2txt, error: (<class 'KeyError'>, KeyError("There is no item named 'word/document.xml' in the archive",), <traceback object at 0x7f88aee33348>)
  warnings.warn(info_, RuntimeWarning)
/content/gdrive/My Drive/GazpromOil/Contracts/Купля-продажа.doc
zero-size array to reduction operation maximum which has no identity
/content/gdrive/My Drive/GazpromOil/Contracts/ГПН 3 Договор пожертвования 21.02.2019.doc
zero-size array to reduction operation maximum which has no identity
nlp_tools/integration/doc_providers.py:29: RuntimeWarning: Cannot read file /content/gdrive/My Drive/GazpromOil/Contracts/ГПН 3 Договор пожертвования 21.02.2019.doc with docx2txt, error: (<class 'KeyError'>, KeyError("There is no item named 'word/document.xml' in the archive",), <traceback object at 0x7f88aee33348>)
  warnings.warn(info_, RuntimeWarning)
compartia commented 5 years ago
ValueError                                Traceback (most recent call last)
<ipython-input-13-927075366f6c> in <module>()
     18   short_fn = fn.split('/')[-1]
     19   if 'Договор купли-продажи_ООО Парацельс.doc'==short_fn:
---> 20     contractAnlysingContext.analyze_contract(text)
     21     contract = contractAnlysingContext.contract
     22     contract.filename = fn

7 frames
/content/nlp_tools/contract_parser.py in analyze_contract(self, contract_text)
    100     # create DOC
    101     self.contract = ContractDocument(contract_text)
--> 102     self.contract.parse()
    103 
    104     self._logstep("parsing document 👞 and detecting document high-level structure")

/content/nlp_tools/contract_parser.py in parse(self, txt)
     47 
     48   def parse(self, txt=None):
---> 49     super().parse()
     50     agent_infos = find_org_names_spans(self.tokens_map_norm)
     51     self.agents_tags = agent_infos_to_tags(agent_infos)

/content/nlp_tools/legal_docs.py in parse(self, txt)
    142 
    143     self.structure = DocumentStructure()
--> 144     self.structure.detect_document_structure(self.tokens_map)
    145 
    146   def preprocess_text(self, txt):

/content/nlp_tools/doc_structure.py in detect_document_structure(self, tokens_map)
    357     self.structure = self._fix_structure(structure)
    358 
--> 359     self.headline_indexes = self._find_headlines(tokens_map)
    360     self.headline_indexes = self._merge_headlines_if_underlying_section_is_tiny(self.headline_indexes)
    361 

/content/nlp_tools/doc_structure.py in _find_headlines(self, tokens_map)
    427 
    428     """ 🧠🕺 Magic an Brainfu** inside """
--> 429     _contrasted_probability = self._highlight_headlines_probability(headlines_probability)
    430     headline_indexes = sorted(np.nonzero(_contrasted_probability)[0])
    431 

/content/nlp_tools/doc_structure.py in _highlight_headlines_probability(self, p_per_line)
    440       return delta, blured
    441 
--> 442     max = np.max(p_per_line)
    443     result = relu(p_per_line, max / 3.0)
    444     contrasted, smoothed = local_contrast(result)

/usr/local/lib/python3.6/dist-packages/numpy/core/fromnumeric.py in amax(a, axis, out, keepdims, initial)
   2503     """
   2504     return _wrapreduction(a, np.maximum, 'max', axis, None, out, keepdims=keepdims,
-> 2505                           initial=initial)
   2506 
   2507 

/usr/local/lib/python3.6/dist-packages/numpy/core/fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
     84                 return reduction(axis=axis, out=out, **passkwargs)
     85 
---> 86     return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
     87 
     88 

ValueError: zero-size array to reduction operation maximum which has no identity