Since the 1.8.0 release, we are no longer able to use the .to_markdown() method.
The workflow we use is as follows (mainly used for pdfs):
create the JSON by calling the call_textract function with the layout and table features included
use the json to create "textractor.entities.document.Document"
later we experiment with these three methods: get_text_from_layout_json(), "textractor.entities.document.Document".get_text() with configs, or the document.to_markdown() method
using .to_markdown() method we are getting:
---------------------------------------------------------------------------
UnboundLocalError Traceback (most recent call last)
Cell In[36], line 1
----> 1 print(document.to_markdown())
2 with open('using_markdown_testimage.txt', 'w', encoding = "utf-8") as outfile:
3 outfile.write(document.to_markdown())
File /opt/conda/lib/python3.10/site-packages/textractor/entities/linearizable.py:59, in Linearizable.to_markdown(self, config)
49 def to_markdown(
50 self,
51 config: MarkdownLinearizationConfig = MarkdownLinearizationConfig()
52 ) -> str:
53 """
54 Returns the markdown representation of the entity
55
56 :return: Markdown text of the entity
57 :rtype: str
58 """
---> 59 return self.get_text(config)
File /opt/conda/lib/python3.10/site-packages/textractor/entities/linearizable.py:24, in Linearizable.get_text(self, config)
13 def get_text(
14 self, config: TextLinearizationConfig = TextLinearizationConfig()
15 ) -> str:
16 """
17 Returns the linearized text of the entity
18
(...)
22 :rtype: str
23 """
---> 24 text, _ = self.get_text_and_words(config=config)
25 return text
File /opt/conda/lib/python3.10/site-packages/textractor/entities/document.py:261, in Document.get_text_and_words(self, config)
258 def get_text_and_words(
259 self, config: TextLinearizationConfig = TextLinearizationConfig()
260 ) -> Tuple[str, List]:
--> 261 text, words_lists = zip(*[p.get_text_and_words(config) for p in self.pages])
262 flattened_words = []
263 for words in words_lists:
File /opt/conda/lib/python3.10/site-packages/textractor/entities/document.py:261, in <listcomp>(.0)
258 def get_text_and_words(
259 self, config: TextLinearizationConfig = TextLinearizationConfig()
260 ) -> Tuple[str, List]:
--> 261 text, words_lists = zip(*[p.get_text_and_words(config) for p in self.pages])
262 flattened_words = []
263 for words in words_lists:
File /opt/conda/lib/python3.10/site-packages/textractor/entities/page.py:169, in Page.get_text_and_words(self, config)
166 else:
167 sorted_layouts.append(unsorted_layout)
--> 169 page_texts_and_words = [l.get_text_and_words(config) for l in sorted_layouts]
171 if not page_texts_and_words:
172 return "", []
File /opt/conda/lib/python3.10/site-packages/textractor/entities/page.py:169, in <listcomp>(.0)
166 else:
167 sorted_layouts.append(unsorted_layout)
--> 169 page_texts_and_words = [l.get_text_and_words(config) for l in sorted_layouts]
171 if not page_texts_and_words:
172 return "", []
File /opt/conda/lib/python3.10/site-packages/textractor/entities/layout.py:222, in Layout.get_text_and_words(self, config)
218 final_text = (
219 config.text_prefix + final_text + config.text_suffix
220 )
221 else:
--> 222 final_text, final_words = linearize_children(
223 self.children,
224 config,
225 no_new_lines=False,
226 is_layout_table=self.layout_type == LAYOUT_TABLE,
227 )
229 if config.add_prefixes_and_suffixes_in_text:
230 if self.layout_type == LAYOUT_TABLE:
File /opt/conda/lib/python3.10/site-packages/textractor/utils/text_utils.py:150, in linearize_children(elements, config, no_new_lines, is_layout_table)
148 added_words = set()
149 for idx, element in enumerate(sorted_group):
--> 150 text_element, words_element = element.get_text_and_words(config)
151 if "Table" in element.__class__.__name__ and len(words_element):
152 result += text_element
File /opt/conda/lib/python3.10/site-packages/textractor/entities/table.py:1033, in Table.get_text_and_words(self, config)
1030 text += (local_config.table_row_suffix if local_config.add_prefixes_and_suffixes_in_text else "")
1031 text += local_config.table_row_separator
-> 1033 if local_config.table_add_title_as_caption and self.title:
1034 text += "<caption>" + self.title.get_text() + "</caption>"
1036 text += (local_config.table_suffix if local_config.add_prefixes_and_suffixes_in_text else "")
UnboundLocalError: local variable 'local_config' referenced before assignment
Since the 1.8.0 release, we are no longer able to use the .to_markdown() method.
The workflow we use is as follows (mainly used for pdfs):
using .to_markdown() method we are getting:
Please let me know if this is an issue on our side.