petermr / amilib

Python library of `ami` software especially NLP, HTML, downloading and related convenience utilities
Apache License 2.0
1 stars 0 forks source link

Equality test fails in pdf chars #16

Open Smritiabcd opened 4 months ago

Smritiabcd commented 4 months ago

C:\Users\asus\Desktop\Semantic\amilib\test> pytest test_pdf.py::PDFCharacterTest::test_pdfplumber_full_page_info_LOWLEVEL_CHARS
======================================================= test session starts ========================================================
platform win32 -- Python 3.12.3, pytest-8.2.0, pluggy-1.5.0
rootdir: C:\Users\asus\Desktop\Semantic\amilib
collected 1 item

test_pdf.py F [100%]

============================================================= FAILURES =============================================================
__ PDFCharacterTest.test_pdfplumber_full_page_info_LOWLEVEL_CHARS __

self =

def test_pdfplumber_full_page_info_LOWLEVEL_CHARS(self):
    """The definitive catalog of all objects on a page"""
    assert PMC1421_PDF.exists(), f"{PMC1421_PDF} should exist"

    # also ['_text', 'matrix', 'fontname', 'ncs', 'graphicstate', 'adv', 'upright', 'x0', 'y0', 'x1', 'y1',
    # 'width', 'height', 'bbox', 'size', 'get_text',
    # 'is_compatible', 'set_bbox', 'is_empty', 'is_hoverlap',
    # 'hdistance', 'hoverlap', 'is_voverlap', 'vdistance', 'voverlap', 'analyze', ']
    with pdfplumber.open(PMC1421_PDF) as pdf:
        first_page = pdf.pages[0]
        # print(type(first_page), first_page.__dir__())
        """
        dir: ['pdf', 'root_page', 'page_obj', 'page_number', 'rotation', 'initial_doctop', 'cropbox', 'mediabox',
        'bbox', 'cached_properties', 'is_original', 'pages', 'width',
        'height', 'layout', 'annots', 'hyperlinks', 'objects', 'process_object', 'iter_layout_objects', 'parse_objects',
        'debug_tablefinder', 'find_tables', 'extract_tables', 'extract_table', 'get_text_layout', 'search', 'extract_text',
         'extract_words', 'crop', 'within_bbox', 'filter', 'dedupe_chars', 'to_image', 'to_dict',
         'flush_cache', 'rects', 'lines', 'curves', 'images', 'chars', 'textboxverticals', 'textboxhorizontals',
         'textlineverticals', 'textlinehorizontals', 'rect_edges', 'edges', 'horizontal_edges', 'vertical_edges', 'to_json',
          'to_csv', ]
        """
        assert first_page.page_number == 1
        assert first_page.rotation == 0
        assert first_page.initial_doctop == 0
        assert first_page.cropbox == (0, 0, 595.22, 842)
        assert first_page.mediabox == (0, 0, 595.22, 842)
        assert first_page.bbox == (0, 0, 595.22, 842)
        assert first_page.cached_properties == ['_rect_edges', '_curve_edges', '_edges', '_objects', '_layout']
        assert first_page.is_original
        assert first_page.pages is None
        assert first_page.width == 595.22
        assert first_page.height == 842
        # assert first_page.layout: < LTPage(1)
        # 0.000, 0.000, 595.220, 842.000
        # rotate = 0 >
        assert first_page.annots == []
        assert first_page.hyperlinks == []
        assert len(first_page.objects) == 2
        assert type(first_page.objects) is dict
        assert list(first_page.objects.keys()) == ['char', 'line']
        assert len(first_page.objects['char']) == 4411
        assert first_page.objects['char'][:2] == [
            {'matrix': (9, 0, 0, 9, 319.74, 797.4203),
             'mcid': None,
             'ncs': 'DeviceCMYK',
             'non_stroking_pattern': None,
             'stroking_pattern': None,
             'tag': None,
             'fontname': 'KAAHHD+Calibri,Italic',
             'adv': 0.319,
             'upright': True,
             'x0': 319.74, 'y0': 795.1703, 'x1': 322.611, 'y1': 804.1703,
             'width': 2.870999999999981, 'height': 9.0, 'size': 9.0,
             'object_type': 'char', 'page_number': 1,
             'text': 'J', 'stroking_color': None, 'non_stroking_color': (0.86667, 0.26667, 1, 0.15294),
             'top': 37.8297, 'bottom': 46.8297, 'doctop': 37.8297
             },
            {'matrix': (9, 0, 0, 9, 322.6092, 797.4203), 'fontname': 'KAAHHD+Calibri,Italic', 'adv': 0.513,
             'mcid': None,
             'ncs': 'DeviceCMYK',
             'non_stroking_pattern': None,
             'stroking_pattern': None,
             'tag': None,
             'upright': True,
             'x0': 322.6092, 'y0': 795.1703, 'x1': 327.2262, 'y1': 804.1703, 'width': 4.617000000000019,
             'height': 9.0, 'size': 9.0,
             'object_type': 'char', 'page_number': 1, 'text': 'o', 'stroking_color': None,
             'non_stroking_color': (0.86667, 0.26667, 1, 0.15294),
             'top': 37.8297, 'bottom': 46.8297, 'doctop': 37.8297},
        ], f"first_page.objects['char'][0]  {first_page.objects['char'][0]}"
        assert len(first_page.objects['line']) == 1, f" len(first_page.objects['line'])"
        assert first_page.objects['line'][0] == {
            'bottom': 48.24000000000001, 'doctop': 48.24000000000001, 'evenodd': False, 'fill': False,
            'height': 0.0, 'linewidth': 1, 'mcid': None, 'non_stroking_color': (0,),
            'non_stroking_pattern': None, 'object_type': 'line', 'page_number': 1,
            # this may be different y-coord system
            # 'pts': [(56.7, 793.76), (542.76, 793.76)],
            'pts': [(56.7, 48.24000000000001), (542.76, 48.24000000000001)],
            'stroke': True, 'stroking_color': (0.3098, 0.24706, 0.2549, 0), 'stroking_pattern': None,
            'tag': None, 'top': 48.24000000000001, 'width': 486.06,
            'x0': 56.7, 'x1': 542.76, 'y0': 793.76, 'y1': 793.76
        }, f"first_page.objects['line'][0] {first_page.objects['line'][0]}"
E       AssertionError: first_page.objects['line'][0] {'x0': 56.7, 'y0': 793.76, 'x1': 542.76, 'y1': 793.76, 'width': 486.06, 'height': 0.0, 'pts': [(56.7, 48.24000000000001), (542.76, 48.24000000000001)], 'linewidth': 1, 'stroke': True, 'fill': False, 'evenodd': False, 'stroking_color': (0.3098, 0.24706, 0.2549, 0), 'non_stroking_color': (0,), 'mcid': None, 'tag': None, 'object_type': 'line', 'page_number': 1, 'stroking_pattern': None, 'non_stroking_pattern': None, 'path': [('m', (56.7, 48.24000000000001)), ('l', (542.76, 48.24000000000001))], 'dash': ([], 0), 'top': 48.24000000000001, 'bottom': 48.24000000000001, 'doctop': 48.24000000000001}
E       assert {'bottom': 48...': False, ...} == {'bottom': 48...': False, ...}
E
E         Omitting 22 identical items, use -vv to show
E         Left contains 2 more items:
E         {'dash': ([], 0),
E          'path': [('m', (56.7, 48.24000000000001)), ('l', (542.76, 48.24000000000001))]}
E         Use -v to get more diff

test_pdf.py:1093: AssertionError
===================================================== short test summary info ======================================================
FAILED test_pdf.py::PDFCharacterTest::test_pdfplumber_full_page_info_LOWLEVEL_CHARS - AssertionError: first_page.objects['line'][0] {'x0': 56.7, 'y0': 793.76, 'x1': 542.76, 'y1': 793.76, 'width': 486.06, 'height':...
======================================================== 1 failed in 7.88s =========================================================