from mmda.recipes.core_recipe import CoreRecipe
file_name = 'a85f7a895ed9cbe09a90b8b449ad7356fb92de6a.pdf'
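recipe_doc = CoreRecipe()  # from_path is an instance method (see trace below), so a recipe instance is needed
doc = recipe_doc.from_path(file_name)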
Stack trace:
File ~/Documents/codes/git/ai2/s2/mmda/src/mmda/recipes/core_recipe.py:56, in CoreRecipe.from_path(self, pdfpath)
53 equations = self.effdet_mfd_predictor.predict(document=doc)
55 # we annotate layout info in the document
---> 56 doc.annotate(layout=layout)
58 # list annotations separately
59 doc.annotate(equations=equations)
File ~/Documents/codes/git/ai2/s2/mmda/src/mmda/types/document.py:97, in Document.annotate(self, is_overwrite, **kwargs)
91 span_groups = self._annotate_span_group(
92 span_groups=annotations, field_name=field_name
93 )
94 elif annotation_type == BoxGroup:
95 # TODO: not good. BoxGroups should be stored on their own, not auto-generating SpanGroups.
96 span_groups = self._annotate_span_group(
---> 97 span_groups=box_groups_to_span_groups(annotations, self), field_name=field_name
98 )
99 else:
100 raise NotImplementedError(
101 f"Unsupported annotation type {annotation_type} for {field_name}"
102 )
File ~/Documents/codes/git/ai2/s2/mmda/src/mmda/utils/tools.py:70, in box_groups_to_span_groups(box_groups, doc, pad_x, center)
66 for box in box_group.boxes:
67
68 # Caching the page tokens to avoid duplicated search
69 if box.page not in all_page_tokens:
---> 70 cur_page_tokens = all_page_tokens[box.page] = doc.pages[
71 box.page
72 ].tokens
73 if token_box_in_box_group is None:
74 # Determine whether box is stored on token SpanGroup span.box or in the box_group
75 token_box_in_box_group = all(
76 [
77 (
(...)
82 ]
83 )
IndexError: list index out of range
It appears that the doc has fewer pages than the box_groups refer to (see the ipdb output below).
Here is the code to reproduce the error
Stack trace:
It appears that the doc has fewer pages than the box_groups refer to, e.g.
ipdb> set([box.page for box_group in box_groups for box in box_group])
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36}
ipdb> len(doc.pages)
35
It appears that doc.pages is missing some of the pages.
List of shas: 736aea59f4c4d6d52ffe5a5ffabc6f734e142239, a85f7a895ed9cbe09a90b8b449ad7356fb92de6a, 0197e4b6a68e920019b3bb2ae2acde6b61eb96c5
More errors can be found in this Datadog log.