Closed particitae closed 1 year ago
That's a broken XML file (specifically, an invalid custom string in one of the text lines). I've pushed a commit so that this code path also catches ValueError; it should now produce a log message instead of just crashing.
I launched "ketos compile -f page -o 2023-04-18.arrow -F data/allXml.txt --workers 24 > debug.log 2>&1" and I got this error: WARNING Region eScdummyblock without coordinates xml.py:242 Extracting lines ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% 0/0 -:--:-- -:--:-- ╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /home/particitae/Kraken-Python3.10/bin/ketos:10 in <module> │
│ │
│ 7 │
│ 8 │
│ 9 if __name__ == "__main__": │
│ ❱ 10 │ sys.exit(cli()) │
│ 11 │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/click/core.py:1130 in │
│ __call__ │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/click/core.py:1055 in main │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/click/core.py:1657 in invoke │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/click/core.py:1404 in invoke │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/click/core.py:760 in invoke │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/click/decorators.py:26 in │
│ new_func │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/kraken/ketos/dataset.py:84 │
│ in compile │
│ │
│ 81 │ with KrakenProgressBar() as progress: │
│ 82 │ │ extract_task = progress.add_task('Extracting lines', total=0, start=False, visib │
│ 83 │ │ │
│ ❱ 84 │ │ arrow_dataset.build_binary_dataset(ground_truth, │
│ 85 │ │ │ │ │ │ │ │ │ │ output, │
│ 86 │ │ │ │ │ │ │ │ │ │ format_type, │
│ 87 │ │ │ │ │ │ │ │ │ │ workers, │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/kraken/lib/arrow_dataset.py: │
│ 161 in build_binary_dataset │
│ │
│ 158 │ if parse_fn: │
│ 159 │ │ for doc in files: │
│ 160 │ │ │ try: │
│ ❱ 161 │ │ │ │ data = parse_fn(doc) │
│ 162 │ │ │ except (FileNotFoundError, KrakenInputException): │
│ 163 │ │ │ │ logger.warning(f'Invalid input file {doc}') │
│ 164 │ │ │ │ continue │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/kraken/lib/xml.py:248 in │
│ parse_page │
│ │
│ 245 │ │ │ # parse transkribus-style custom field if possible │
│ 246 │ │ │ custom_str = region.get('custom') │
│ 247 │ │ │ if not rtype and custom_str: │
│ ❱ 248 │ │ │ │ cs = _parse_page_custom(custom_str) │
│ 249 │ │ │ │ if 'structure' in cs and 'type' in cs['structure']: │
│ 250 │ │ │ │ │ rtype = cs['structure']['type'] │
│ 251 │ │ │ # fall back to default region type if nothing is given │
│ │
│ /home/particitae/Kraken-Python3.10/lib/python3.10/site-packages/kraken/lib/xml.py:188 in │
│ _parse_page_custom │
│ │
│ 185 │ │ l_chunks = [l_chunk for l_chunk in s.split('}') if l_chunk.strip()] │
│ 186 │ │ if l_chunks: │
│ 187 │ │ │ for chunk in l_chunks: │
│ ❱ 188 │ │ │ │ tag, vals = chunk.split('{') │
│ 189 │ │ │ │ tag_vals = {} │
│ 190 │ │ │ │ vals = [val.strip() for val in vals.split(';') if val.strip()] │
│ 191 │ │ │ │ for val in vals: │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ValueError: not enough values to unpack (expected 2, got 1)
Perhaps one of the XML files is broken, but how can I identify it? Best regards