Closed Smritiabcd closed 19 hours ago
C:\Users\asus\Desktop\Semantic\amilib\test> pytest test_pdf.py::PDFCharacterTest::test_pdfplumber_full_page_info_LOWLEVEL_CHARS ======================================================= test session starts ======================================================== platform win32 -- Python 3.12.3, pytest-8.2.0, pluggy-1.5.0 rootdir: C:\Users\asus\Desktop\Semantic\amilib collected 1 item
test_pdf.py F [100%]
============================================================= FAILURES ============================================================= __ PDFCharacterTest.test_pdfplumber_full_page_info_LOWLEVEL_CHARS __
self =
def test_pdfplumber_full_page_info_LOWLEVEL_CHARS(self): """The definitive catalog of all objects on a page""" assert PMC1421_PDF.exists(), f"{PMC1421_PDF} should exist" # also ['_text', 'matrix', 'fontname', 'ncs', 'graphicstate', 'adv', 'upright', 'x0', 'y0', 'x1', 'y1', # 'width', 'height', 'bbox', 'size', 'get_text', # 'is_compatible', 'set_bbox', 'is_empty', 'is_hoverlap', # 'hdistance', 'hoverlap', 'is_voverlap', 'vdistance', 'voverlap', 'analyze', '] with pdfplumber.open(PMC1421_PDF) as pdf: first_page = pdf.pages[0] # print(type(first_page), first_page.__dir__()) """ dir: ['pdf', 'root_page', 'page_obj', 'page_number', 'rotation', 'initial_doctop', 'cropbox', 'mediabox', 'bbox', 'cached_properties', 'is_original', 'pages', 'width', 'height', 'layout', 'annots', 'hyperlinks', 'objects', 'process_object', 'iter_layout_objects', 'parse_objects', 'debug_tablefinder', 'find_tables', 'extract_tables', 'extract_table', 'get_text_layout', 'search', 'extract_text', 'extract_words', 'crop', 'within_bbox', 'filter', 'dedupe_chars', 'to_image', 'to_dict', 'flush_cache', 'rects', 'lines', 'curves', 'images', 'chars', 'textboxverticals', 'textboxhorizontals', 'textlineverticals', 'textlinehorizontals', 'rect_edges', 'edges', 'horizontal_edges', 'vertical_edges', 'to_json', 'to_csv', ] """ assert first_page.page_number == 1 assert first_page.rotation == 0 assert first_page.initial_doctop == 0 assert first_page.cropbox == (0, 0, 595.22, 842) assert first_page.mediabox == (0, 0, 595.22, 842) assert first_page.bbox == (0, 0, 595.22, 842) assert first_page.cached_properties == ['_rect_edges', '_curve_edges', '_edges', '_objects', '_layout'] assert first_page.is_original assert first_page.pages is None assert first_page.width == 595.22 assert first_page.height == 842 # assert first_page.layout: < LTPage(1) # 0.000, 0.000, 595.220, 842.000 # rotate = 0 > assert first_page.annots == [] assert first_page.hyperlinks == [] assert len(first_page.objects) == 2 assert 
type(first_page.objects) is dict assert list(first_page.objects.keys()) == ['char', 'line'] assert len(first_page.objects['char']) == 4411 assert first_page.objects['char'][:2] == [ {'matrix': (9, 0, 0, 9, 319.74, 797.4203), 'mcid': None, 'ncs': 'DeviceCMYK', 'non_stroking_pattern': None, 'stroking_pattern': None, 'tag': None, 'fontname': 'KAAHHD+Calibri,Italic', 'adv': 0.319, 'upright': True, 'x0': 319.74, 'y0': 795.1703, 'x1': 322.611, 'y1': 804.1703, 'width': 2.870999999999981, 'height': 9.0, 'size': 9.0, 'object_type': 'char', 'page_number': 1, 'text': 'J', 'stroking_color': None, 'non_stroking_color': (0.86667, 0.26667, 1, 0.15294), 'top': 37.8297, 'bottom': 46.8297, 'doctop': 37.8297 }, {'matrix': (9, 0, 0, 9, 322.6092, 797.4203), 'fontname': 'KAAHHD+Calibri,Italic', 'adv': 0.513, 'mcid': None, 'ncs': 'DeviceCMYK', 'non_stroking_pattern': None, 'stroking_pattern': None, 'tag': None, 'upright': True, 'x0': 322.6092, 'y0': 795.1703, 'x1': 327.2262, 'y1': 804.1703, 'width': 4.617000000000019, 'height': 9.0, 'size': 9.0, 'object_type': 'char', 'page_number': 1, 'text': 'o', 'stroking_color': None, 'non_stroking_color': (0.86667, 0.26667, 1, 0.15294), 'top': 37.8297, 'bottom': 46.8297, 'doctop': 37.8297}, ], f"first_page.objects['char'][0] {first_page.objects['char'][0]}" assert len(first_page.objects['line']) == 1, f" len(first_page.objects['line'])"
assert first_page.objects['line'][0] == { 'bottom': 48.24000000000001, 'doctop': 48.24000000000001, 'evenodd': False, 'fill': False, 'height': 0.0, 'linewidth': 1, 'mcid': None, 'non_stroking_color': (0,), 'non_stroking_pattern': None, 'object_type': 'line', 'page_number': 1, this may be different y-coord system 'pts': [(56.7, 793.76), (542.76, 793.76)], 'pts': [(56.7, 48.24000000000001), (542.76, 48.24000000000001)], 'stroke': True, 'stroking_color': (0.3098, 0.24706, 0.2549, 0), 'stroking_pattern': None, 'tag': None, 'top': 48.24000000000001, 'width': 486.06, 'x0': 56.7, 'x1': 542.76, 'y0': 793.76, 'y1': 793.76 }, f"first_page.objects['line'][0] {first_page.objects['line'][0]}" E AssertionError: first_page.objects['line'][0] {'x0': 56.7, 'y0': 793.76, 'x1': 542.76, 'y1': 793.76, 'width': 486.06, 'height': 0.0, 'pts': [(56.7, 48.24000000000001), (542.76, 48.24000000000001)], 'linewidth': 1, 'stroke': True, 'fill': False, 'evenodd': False, 'stroking_color': (0.3098, 0.24706, 0.2549, 0), 'non_stroking_color': (0,), 'mcid': None, 'tag': None, 'object_type': 'line', 'page_number': 1, 'stroking_pattern': None, 'non_stroking_pattern': None, 'path': [('m', (56.7, 48.24000000000001)), ('l', (542.76, 48.24000000000001))], 'dash': ([], 0), 'top': 48.24000000000001, 'bottom': 48.24000000000001, 'doctop': 48.24000000000001} E assert {'bottom': 48...': False, ...} == {'bottom': 48...': False, ...} E E Omitting 22 identical items, use -vv to show E Left contains 2 more items: E {'dash': ([], 0), E 'path': [('m', (56.7, 48.24000000000001)), ('l', (542.76, 48.24000000000001))]} E Use -v to get more diff
assert first_page.objects['line'][0] == {
'bottom': 48.24000000000001, 'doctop': 48.24000000000001, 'evenodd': False, 'fill': False, 'height': 0.0, 'linewidth': 1, 'mcid': None, 'non_stroking_color': (0,), 'non_stroking_pattern': None, 'object_type': 'line', 'page_number': 1,
'pts': [(56.7, 48.24000000000001), (542.76, 48.24000000000001)], 'stroke': True, 'stroking_color': (0.3098, 0.24706, 0.2549, 0), 'stroking_pattern': None, 'tag': None, 'top': 48.24000000000001, 'width': 486.06, 'x0': 56.7, 'x1': 542.76, 'y0': 793.76, 'y1': 793.76 }, f"first_page.objects['line'][0] {first_page.objects['line'][0]}" E AssertionError: first_page.objects['line'][0] {'x0': 56.7, 'y0': 793.76, 'x1': 542.76, 'y1': 793.76, 'width': 486.06, 'height': 0.0, 'pts': [(56.7, 48.24000000000001), (542.76, 48.24000000000001)], 'linewidth': 1, 'stroke': True, 'fill': False, 'evenodd': False, 'stroking_color': (0.3098, 0.24706, 0.2549, 0), 'non_stroking_color': (0,), 'mcid': None, 'tag': None, 'object_type': 'line', 'page_number': 1, 'stroking_pattern': None, 'non_stroking_pattern': None, 'path': [('m', (56.7, 48.24000000000001)), ('l', (542.76, 48.24000000000001))], 'dash': ([], 0), 'top': 48.24000000000001, 'bottom': 48.24000000000001, 'doctop': 48.24000000000001} E assert {'bottom': 48...': False, ...} == {'bottom': 48...': False, ...} E E Omitting 22 identical items, use -vv to show E Left contains 2 more items: E {'dash': ([], 0), E 'path': [('m', (56.7, 48.24000000000001)), ('l', (542.76, 48.24000000000001))]} E Use -v to get more diff
test_pdf.py:1093: AssertionError ===================================================== short test summary info ====================================================== FAILED test_pdf.py::PDFCharacterTest::test_pdfplumber_full_page_info_LOWLEVEL_CHARS - AssertionError: first_page.objects['line'][0] {'x0': 56.7, 'y0': 793.76, 'x1': 542.76, 'y1': 793.76, 'width': 486.06, 'height':... ======================================================== 1 failed in 7.88s =========================================================
Ignore complex output. The test may fail due to minor character differences.
C:\Users\asus\Desktop\Semantic\amilib\test> pytest test_pdf.py::PDFCharacterTest::test_pdfplumber_full_page_info_LOWLEVEL_CHARS ======================================================= test session starts ======================================================== platform win32 -- Python 3.12.3, pytest-8.2.0, pluggy-1.5.0 rootdir: C:\Users\asus\Desktop\Semantic\amilib collected 1 item
test_pdf.py F [100%]
============================================================= FAILURES ============================================================= __ PDFCharacterTest.test_pdfplumber_full_page_info_LOWLEVEL_CHARS __
self =
test_pdf.py:1093: AssertionError ===================================================== short test summary info ====================================================== FAILED test_pdf.py::PDFCharacterTest::test_pdfplumber_full_page_info_LOWLEVEL_CHARS - AssertionError: first_page.objects['line'][0] {'x0': 56.7, 'y0': 793.76, 'x1': 542.76, 'y1': 793.76, 'width': 486.06, 'height':... ======================================================== 1 failed in 7.88s =========================================================