Open soldni opened 1 year ago
Fails if PDF contains an empty page (e.g., s3://ai2-s2-pdfs/aaaa/13131d475c6b202b7607c1c1336cd08b8f34.pdf
)
File /opt/miniconda3/lib/python3.9/site-packages/mmda/types/document.py:175, in Document._annotate_box_group(self, box_groups, field_name)
168 for box in box_group.boxes:
169
170 # Caching the page tokens to avoid duplicated search
171 if box.page not in all_page_tokens:
172 cur_page_tokens = all_page_tokens[box.page] = list(
173 itertools.chain.from_iterable(
174 span_group.spans
--> 175 for span_group in self.pages[box.page].tokens
176 )
177 )
178 else:
179 cur_page_tokens = all_page_tokens[box.page]
IndexError: list index out of range
Fails if the PDF has no pdfplumber rows (e.g., s3://ai2-s2-pdfs/aaaa/08cd21552b21cfb7f92e2fd9c195881f0048.pdf
)
IndexError Traceback (most recent call last)
Cell In[15], line 1
----> 1 recipe.from_path('/home/lucas/aaaa/08cd21552b21cfb7f92e2fd9c195881f0048.pdf')
File /opt/miniconda3/lib/python3.9/site-packages/mmda/recipes/core_recipe.py:41, in CoreRecipe.from_path(self, pdfpath)
39 def from_path(self, pdfpath: str) -> Document:
40 logger.info("Parsing document...")
---> 41 doc = self.parser.parse(input_pdf_path=pdfpath)
43 logger.info("Rasterizing document...")
44 images = self.rasterizer.rasterize(input_pdf_path=pdfpath, dpi=72)
File /opt/miniconda3/lib/python3.9/site-packages/mmda/parsers/pdfplumber_parser.py:231, in PDFPlumberParser.parse(self, input_pdf_path)
227 all_tokens.extend(fine_tokens)
228 all_row_ids.extend(
229 [i + last_row_id + 1 for i in line_ids_of_fine_tokens]
230 )
--> 231 last_row_id = all_row_ids[-1]
232 all_word_ids.extend(
233 [i + last_word_id + 1 for i in word_ids_of_fine_tokens]
234 )
235 last_word_id = all_word_ids[-1]
IndexError: list index out of range
Note that this case happens a lot with PDFs that are scans, contain no text, etc. — files that are definitely not academic papers.
Fails on corrupted files (e.g., s3://ai2-s2-pdfs/aaaa/10da16bdac973e062e02f8d1d2c927b84193.pdf
)
File /opt/miniconda3/lib/python3.9/site-packages/pdfplumber/pdf.py:40, in PDF.__init__(self, stream, stream_is_external, pages, laparams, password, strict_metadata)
37 self.pages_to_parse = pages
38 self.laparams = None if laparams is None else LAParams(**laparams)
---> 40 self.doc = PDFDocument(PDFParser(stream), password=password)
41 self.rsrcmgr = PDFResourceManager()
42 self.metadata = {}
File /opt/miniconda3/lib/python3.9/site-packages/pdfminer/pdfdocument.py:752, in PDFDocument.__init__(self, parser, password, caching, fallback)
750 break
751 else:
--> 752 raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
753 if self.catalog.get("Type") is not LITERAL_CATALOG:
754 if settings.STRICT:
PDFSyntaxError: No /Root object! - Is this really a PDF?
We should fail safely on this — maybe return an empty doc?
There's a doc with 11,000+ pages; rasterization hangs for a very long time: s3://ai2-s2-pdfs/aaaa/1936f84cca454ff18c99d17b1a30eda9c1db.pdf
@kyleclo mentioned looking at ways we can parse just the metadata, and skip the document if the PDF is very large.
VILA fails if no blocks are predicted (e.g., in s3://ai2-s2-pdfs/aaaa/1866d0aeea127176cf112c9804e9f819546b.pdf
)
File /opt/miniconda3/lib/python3.9/site-packages/mmda/predictors/hf_predictors/utils.py:43, in get_visual_group_id(token, field_name, defaults)
42 def get_visual_group_id(token: SpanGroup, field_name: str, defaults=-1) -> int:
---> 43 if not hasattr(token, field_name):
44 return defaults
45 field_value = getattr(token, field_name)
File /opt/miniconda3/lib/python3.9/site-packages/mmda/types/annotation.py:72, in Annotation.__getattr__(self, field)
69 raise ValueError("This annotation is not attached to a document")
71 if field in self.doc.fields:
---> 72 return self.doc.find_overlapping(self, field)
74 if field in self.doc.fields:
75 return self.doc.find_overlapping(self, field)
File /opt/miniconda3/lib/python3.9/site-packages/mmda/types/document.py:41, in Document.find_overlapping(self, query, field_name)
37 if not isinstance(query, SpanGroup):
38 raise NotImplementedError(
39 f"Currently only supports query of type SpanGroup"
40 )
---> 41 return self.__indexers[field_name].find(query=query)
KeyError: 'blocks'
Another pdfplumber row error on document s3://ai2-s2-pdfs/aaaa/1804144b403e699b1ef5ee839dc9b8bbf4c2.pdf
. @kyleclo mentioned that this PDF looks like something that could be part of a reproducibility checklist, so we should fail more gracefully on it.
Word predictor failed on s3://ai2-s2-pdfs/aaaa/25562b11f72bc76c0d0f7efdb417355fc6e2.pdf
@kyleclo mentioned that maybe we shouldn't run the word predictor.
File /opt/miniconda3/lib/python3.9/site-packages/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py:150, in DictionaryWordPredictor.predict(self, document)
142 internal_dictionary = self._build_internal_dictionary(
143 document=document,
144 token_id_to_token_ids=token_id_to_token_ids,
145 row_start_after_hyphen_token_ids=row_start_after_hyphen_token_ids,
146 row_end_with_hyphen_token_ids=row_end_with_hyphen_token_ids
147 )
149 # 4) predict words for using token features
--> 150 token_id_to_word_id, word_id_to_text = self._predict_tokens(
151 document=document,
152 internal_dictionary=internal_dictionary,
153 token_id_to_token_ids=token_id_to_token_ids,
154 row_start_after_hyphen_token_ids=row_start_after_hyphen_token_ids,
155 row_end_with_hyphen_token_ids=row_end_with_hyphen_token_ids,
156 max_row_end_token_id_to_min_row_start_token_id=max_row_end_token_id_to_min_row_start_token_id,
157 punct_r_strip_candidate_token_ids=punct_r_strip_candidate_token_ids,
158 punct_l_strip_candidate_token_ids=punct_l_strip_candidate_token_ids
159 )
161 # 5) transformation
162 words: List[SpanGroup] = self._convert_to_words(
163 document=document,
164 token_id_to_word_id=token_id_to_word_id,
165 word_id_to_text=word_id_to_text
166 )
File /opt/miniconda3/lib/python3.9/site-packages/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py:445, in DictionaryWordPredictor._predict_tokens(self, document, internal_dictionary, token_id_to_token_ids, row_start_after_hyphen_token_ids, row_end_with_hyphen_token_ids, max_row_end_token_id_to_min_row_start_token_id, punct_r_strip_candidate_token_ids, punct_l_strip_candidate_token_ids)
442 if token.id in punct_l_strip_candidate_token_ids:
443 # capture current state, before fixing
444 word_id = token_id_to_word_id[token.id]
--> 445 word_text = word_id_to_text[word_id]
446 other_same_word_token_ids = [
447 i for i in word_id_to_token_ids[token_id_to_word_id[token.id]]
448 if token_id_to_word_id[i] == word_id and i != token.id
449 ]
450 new_first_token_id = min(other_same_word_token_ids)
KeyError: None
It is a scan of an old document — maybe something to do with weird characters?
Another error in word predictor (s3://ai2-s2-pdfs/aaaa/1373919748374a915e85d018af472151dd99.pdf
)
ry_word_predictor.py:150, in DictionaryWordPredictor.predict(self, document)
142 internal_dictionary = self._build_internal_dictionary(
143 document=document,
144 token_id_to_token_ids=token_id_to_token_ids,
145 row_start_after_hyphen_token_ids=row_start_after_hyphen_token_ids,
146 row_end_with_hyphen_token_ids=row_end_with_hyphen_token_ids
147 )
149 # 4) predict words for using token features
--> 150 token_id_to_word_id, word_id_to_text = self._predict_tokens(
151 document=document,
152 internal_dictionary=internal_dictionary,
153 token_id_to_token_ids=token_id_to_token_ids,
154 row_start_after_hyphen_token_ids=row_start_after_hyphen_token_ids,
155 row_end_with_hyphen_token_ids=row_end_with_hyphen_token_ids,
156 max_row_end_token_id_to_min_row_start_token_id=max_row_end_token_id_to_min_row_start_token_id,
157 punct_r_strip_candidate_token_ids=punct_r_strip_candidate_token_ids,
158 punct_l_strip_candidate_token_ids=punct_l_strip_candidate_token_ids
159 )
161 # 5) transformation
162 words: List[SpanGroup] = self._convert_to_words(
163 document=document,
164 token_id_to_word_id=token_id_to_word_id,
165 word_id_to_text=word_id_to_text
166 )
File /opt/miniconda3/lib/python3.9/site-packages/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py:487, in DictionaryWordPredictor._predict_tokens(self, document, internal_dictionary, token_id_to_token_ids, row_start_after_hyphen_token_ids, row_end_with_hyphen_token_ids, max_row_end_token_id_to_min_row_start_token_id, punct_r_strip_candidate_token_ids, punct_l_strip_candidate_token_ids)
460 del word_id_to_token_ids
462 # edge case handling. there are cases (e.g. tables) where each cell is detected as its own
463 # row. This is super annoying but *shrug*. In these cases, a cell "-" followed by another
464 # cell "48.9" can be represented as 2 adjacent rows. This can cause the token for "48"
(...)
485
486 # are there any unclassified tokens?
--> 487 assert None not in token_id_to_word_id.values()
488 return token_id_to_word_id, word_id_to_text
AssertionError:
PDF looks ok?
Final stats on the sample I ran on
elapsed time: 3162.67 seconds
failures: 41
correct: 293
All PDFs that failed
s3://ai2-s2-pdfs/aaaa/1866d0aeea127176cf112c9804e9f819546b.pdf
s3://ai2-s2-pdfs/aaaa/1804144b403e699b1ef5ee839dc9b8bbf4c2.pdf
s3://ai2-s2-pdfs/aaaa/07831b64ffe12858fa9ac9da2d050fd52fbf.pdf
s3://ai2-s2-pdfs/aaaa/02b7ac88bc3ace43020092d2741f75be39fc.pdf
s3://ai2-s2-pdfs/aaaa/0acfacab1690ff81d20b8b760d818884116b.pdf
s3://ai2-s2-pdfs/aaaa/1c9464c859d4b42371a34283135dc5902955.pdf
s3://ai2-s2-pdfs/aaaa/00cbc67b385760639074f29aa8c71a7fd0b4.pdf
s3://ai2-s2-pdfs/aaaa/241c0e4af4a7825e9d5fe9ec43331b383b7e.pdf
s3://ai2-s2-pdfs/aaaa/26bfd01b42b0371ef4a01f02c47e420b2384.pdf
s3://ai2-s2-pdfs/aaaa/2148c342b40600f8af45eeeb2ce42a0a20a2.pdf
s3://ai2-s2-pdfs/aaaa/24376f25b07599e138789f9a1ae32b62b8d7.pdf
s3://ai2-s2-pdfs/aaaa/25562b11f72bc76c0d0f7efdb417355fc6e2.pdf
s3://ai2-s2-pdfs/aaaa/2184ea74779be5173c72e96c2747f70be8a8.pdf
s3://ai2-s2-pdfs/aaaa/1373919748374a915e85d018af472151dd99.pdf
s3://ai2-s2-pdfs/aaaa/10087044c92b9f6edaa6843e669762335704.pdf
s3://ai2-s2-pdfs/aaaa/202be4542434c5a07a558b19a37d1e79f285.pdf
s3://ai2-s2-pdfs/aaaa/139908b58091c066a4dddf5651c576d9118f.pdf
s3://ai2-s2-pdfs/aaaa/1d8cf85d61aa64a3e5773644597c43450591.pdf
s3://ai2-s2-pdfs/aaaa/16dbd7eaacaf0147ce16a454348cb9e3dee0.pdf
s3://ai2-s2-pdfs/aaaa/1c4a5f9f62db7fced574d9f2b7eb3f2f5e87.pdf
s3://ai2-s2-pdfs/aaaa/0fbcb30c2e86fc00d4070eef8675bd4e5871.pdf
s3://ai2-s2-pdfs/aaaa/18a1f90cd61f5db25a75f3b080863457006a.pdf
s3://ai2-s2-pdfs/aaaa/08ae3b4b8d557a65553bdfdb66c5538b6717.pdf
s3://ai2-s2-pdfs/aaaa/0855b08b07c86cb251cd11f627a2cd459948.pdf
s3://ai2-s2-pdfs/aaaa/2066e18ca61ff80dd5f3957a12fecb667e46.pdf
s3://ai2-s2-pdfs/aaaa/066055a5dded74117de0f9947c8af52319c9.pdf
s3://ai2-s2-pdfs/aaaa/0a89df640bb5b667cf66a12b2db35632518e.pdf
s3://ai2-s2-pdfs/aaaa/110ec91b0d97206016ed56857370b947bd6a.pdf
s3://ai2-s2-pdfs/aaaa/01fa7d1fa51d597f18790820db9814486b8b.pdf
s3://ai2-s2-pdfs/aaaa/268a8beca47f1374dc9db4f677e1607f85da.pdf
s3://ai2-s2-pdfs/aaaa/19a0bde07062a3c98129d0046100db9a709e.pdf
s3://ai2-s2-pdfs/aaaa/1fa5190b1003cddd234ca03f4b2519e14612.pdf
s3://ai2-s2-pdfs/aaaa/097240cbe2f92aba81caa074ea48aebb2c72.pdf
s3://ai2-s2-pdfs/aaaa/260189000a8897e40a0b0f6ac7ab6aeefc97.pdf
s3://ai2-s2-pdfs/aaaa/24fe8d37d2cebe8cb03663a7e6a32971e418.pdf
s3://ai2-s2-pdfs/aaaa/253b636bd6fca1a0c9908421b27223dacff0.pdf
s3://ai2-s2-pdfs/aaaa/1a11c24dd5b307036ce16626aa1e2b9a9d1d.pdf
s3://ai2-s2-pdfs/aaaa/0b2ca104ff0277b27e55672991b71dcdad26.pdf
s3://ai2-s2-pdfs/aaaa/2144886fcf5adf0f926abb4453e3bb7c4d9a.pdf
s3://ai2-s2-pdfs/aaaa/205d8808dad458a3fadf4f31d46aab7990e0.pdf
s3://ai2-s2-pdfs/aaaa/223956c4efc6accb09af724be3ac84770eac.pdf
The following snippet from paper aaaa07831b64ffe12858fa9ac9da2d050fd52fbf
contains \n
characters between tokens instead of spaces:
ipdb> document.symbols[17650:17800]
' RF compatibility needs.\nTable 2.\nSpectral\nCorrelation\nCoefficients\nof\nBOC\nModulations\nSub-\nSpread-\nSSC with\nSSC with carrier\ning Code\nC/A\nSSC with\nM '
which causes the word splitter to fail.
Using this issue to document failures when running on a sample of ~700 PDFs.