Status: Closed (petermr closed this issue 19 hours ago)
6 test failures in PDF reading (all in test/test_pdf.py)
(base) pm286macbook-2:amilib pm286$ python -m pytest ===================================== test session starts ====================================== platform darwin -- Python 3.8.3, pytest-6.2.5, py-1.9.0, pluggy-0.13.1 rootdir: /Users/pm286/workspace/amilib plugins: cov-3.0.0 collected 220 items test/test_file.py ss [ 0%] test/test_headless.py ss..sssssss.....s [ 8%] test/test_html.py ...s.s......s..s.....ssss...s.........s..ssssss..ss.ss..ss............ [ 40%] ....................s....s [ 52%] test/test_nlp.py . [ 52%] test/test_pdf.py .ssF.......s.s.s.sssssssFs..F.ss.ss.FsF..Fssssssssss..s....ss [ 80%] test/test_pytest.py . [ 80%] test/test_stat.py . [ 81%] test/test_svg.py ... [ 82%] test/test_util.py ss.....s...s... [ 89%] test/test_wikidata.py .s...........s....... [ 99%] test/test_xml.py .. [100%] =========================================== FAILURES =========================================== ____________________ PDFPlumberTest.test_pdfplumber_json_single_page_debug _____________________ self = <test.test_pdf.PDFPlumberTest testMethod=test_pdfplumber_json_single_page_debug> def test_pdfplumber_json_single_page_debug(self): """creates AmiPDFPlumber and reads pdf and debugs""" path = Path(os.path.join(HERE, "resources/pdffill-demo.pdf")) assert path.exists, f"{path} should exist" ami_pdfplumber = AmiPDFPlumber() ami_plumber_json = ami_pdfplumber.create_ami_plumber_json(path) > pages = ami_plumber_json.get_ami_json_pages() E AttributeError: 'NoneType' object has no attribute 'get_ami_json_pages' test/test_pdf.py:133: AttributeError ------------------------------------- Captured stdout call ------------------------------------- ERROR open() takes 2 positional arguments but 3 were given for /Users/pm286/workspace/amilib/test/resources/pdffill-demo.pdf Cannot create PDF /Users/pm286/workspace/amilib/test/resources/pdffill-demo.pdf _________________________ PDFChapterTest.test_read_ipcc_chapter__debug _________________________ self = <test.test_pdf.PDFChapterTest 
testMethod=test_read_ipcc_chapter__debug> def test_read_ipcc_chapter__debug(self): """read multipage document and extract properties """ assert IPCC_GLOSSARY.exists(), f"{IPCC_GLOSSARY} should exist" max_page = PDFTest.MAX_PAGE # max_page = 999999 options = [WORDS, ANNOTS] # max_page = 100 # increase this if yu want more output for (pdf_file, page_count) in [ # (IPCC_GLOSSARY, 51), (Resources.TEST_IPCC_CHAP06_PDF, 219) ]: pdf_debug = PDFDebug() with pdfplumber.open(pdf_file) as pdf: print(f"file {pdf_file}") pages = list(pdf.pages) assert len(pages) == page_count for page in pages[:max_page]: > pdf_debug.debug_page_properties(page, debug=options) test/test_pdf.py:756: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ amilib/ami_pdf_libs.py:760: in debug_page_properties self.print_annots(page) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = <amilib.ami_pdf_libs.PDFDebug object at 0x7fbda0561790> page = <pdfplumber.page.Page object at 0x7fbdd0e66af0> def print_annots(self, page): """Prints annots Here's the output of one (it's a hyperlink) annot: dict_items( [ ('page_number', 4), ('object_type', 'annot'), ('x0', 80.75), ('y0', 698.85), ('x1', 525.05), ('y1', 718.77), ('doctop', 2648.91), ('top', 123.14999999999998), ('bottom', 143.06999999999994), ('width', 444.29999999999995), ('height', 19.91999999999996), ('uri', None), ('title', None), ('contents', None), ('data', {'BS': {'W': 0}, 'Dest': [<PDFObjRef:7>, /'XYZ', 69, 769, 0], 'F': 4, 'Rect': [80.75, 698.85, 525.05, 718.77], 'StructParent': 3, 'Subtype': /'Link' } ) ] ) and there are 34 (in a TableOfContents) and they work """ > n_annot = len(page.annots) E AttributeError: 'Page' object has no attribute 'annots' amilib/ami_pdf_libs.py:958: AttributeError ------------------------------------- Captured stdout call ------------------------------------- file 
/Users/pm286/workspace/amilib/test/resources/ar6/Chapter06/fulltext.pdf ======page: 1 =========== W: {'x0': Decimal('149.340'), 'x1': Decimal('170.906'), 'top': Decimal('69.655'), 'bottom': Decimal('89.059'), 'text': 'WG'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('174.114'), 'x1': Decimal('185.284'), 'top': Decimal('69.655'), 'bottom': Decimal('89.059'), 'text': 'III'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('188.443'), 'x1': Decimal('260.930'), 'top': Decimal('69.655'), 'bottom': Decimal('89.059'), 'text': 'contribution'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('264.089'), 'x1': Decimal('276.531'), 'top': Decimal('69.655'), 'bottom': Decimal('89.059'), 'text': 'to'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('279.684'), 'x1': Decimal('299.069'), 'top': Decimal('69.655'), 'bottom': Decimal('89.059'), 'text': 'the'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 318 ['WG', 'III', 'contribution', 'to', 'the'] ... | _ PDFCharacterTest.test_debug_page_properties_chap6_word_count_and_images_data_wg3_old__example _ self = <test.test_pdf.PDFCharacterTest testMethod=test_debug_page_properties_chap6_word_count_and_images_data_wg3_old__example> def test_debug_page_properties_chap6_word_count_and_images_data_wg3_old__example(self): """debug the old-style IPCC WG3 PDF objects (crude) outputs wordcount for page, and any image data. 
Would be better if we knew how to read PDFStream """ maxpage = 9 # images on page 8, and 9 outdir = Path(AmiAnyTest.TEMP_DIR, "pdf", "ar6", "chap6") pdf_debug = PDFDebug() with pdfplumber.open(Resources.TEST_IPCC_CHAP06_PDF) as pdf: pages = list(pdf.pages) for page in pages[:maxpage]: pdf_debug.debug_page_properties(page, debug=[WORDS, IMAGES], outdir=outdir) pdf_debug.write_summary(outdir=outdir) print(f"pdf_debug {pdf_debug.image_dict}\n outdir {outdir}") > assert maxpage != 9 or pdf_debug.image_dict == { ((1397, 779), 143448): (8, (72.0, 523.3), (412.99, 664.64)), ((1466, 655), 122016): (8, (72.0, 523.3), (203.73, 405.38)), ((1634, 854), 204349): (9, (80.9, 514.25), (543.43, 769.92)) } E AssertionError: assert (9 != 9 or {((Decimal('1...('769.920')))} == {((1397, 779)....43, 769.92))} E Differing items: E {((1397, 779), 143448): (8, (Decimal('72'), Decimal('523.300')), (Decimal('412.990'), Decimal('664.640')))} != {((1397, 779), 143448): (8, (72.0, 523.3), (412.99, 664.64))} E {((1634, 854), 204349): (9, (Decimal('80.900'), Decimal('514.250')), (Decimal('543.430'), Decimal('769.920')))} != {((1634, 854), 204349): (9, (80.9, 514.25), (543.43, 769.92))} E {((1466, 655), 122016): (8, (Decimal('72'), Decimal('523.300')), (Decimal('203.730'), Decimal('405.380')))} != {((1466, 655), 122016): (8, (72.0, 523.3), (203.73, 405.38))} E Use -v to get the full diff) test/test_pdf.py:1342: AssertionError ------------------------------------- Captured stdout call ------------------------------------- ======page: 1 =========== image_dict {} W: {'x0': Decimal('149.340'), 'x1': Decimal('170.906'), 'top': Decimal('69.655'), 'bottom': Decimal('89.059'), 'text': 'WG'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('174.114'), 'x1': Decimal('185.284'), 'top': Decimal('69.655'), 'bottom': Decimal('89.059'), 'text': 'III'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('188.443'), 'x1': Decimal('260.930'), 'top': Decimal('69.655'), 'bottom': 
Decimal('89.059'), 'text': 'contribution'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('264.089'), 'x1': Decimal('276.531'), 'top': Decimal('69.655'), 'bottom': Decimal('89.059'), 'text': 'to'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('279.684'), 'x1': Decimal('299.069'), 'top': Decimal('69.655'), 'bottom': Decimal('89.059'), 'text': 'the'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 318 ['WG', 'III', 'contribution', 'to', 'the'] ... | ======page: 2 =========== image_dict {} W: {'x0': Decimal('76.500'), 'x1': Decimal('109.035'), 'top': Decimal('70.539'), 'bottom': Decimal('83.946'), 'text': 'Chapter'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('111.242'), 'x1': Decimal('116.322'), 'top': Decimal('70.539'), 'bottom': Decimal('83.946'), 'text': '6'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('133.200'), 'x1': Decimal('143.380'), 'top': Decimal('70.539'), 'bottom': Decimal('83.946'), 'text': '44'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('189.900'), 'x1': Decimal('213.280'), 'top': Decimal('70.539'), 'bottom': Decimal('83.946'), 'text': '41-42'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('259.380'), 'x1': Decimal('294.045'), 'top': Decimal('70.539'), 'bottom': Decimal('83.946'), 'text': 'Replace:'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 66 ['Chapter', '6', '44', '41-42', 'Replace:'] ... 
| ======page: 3 =========== image_dict {} W: {'x0': Decimal('189.290'), 'x1': Decimal('246.076'), 'top': Decimal('88.215'), 'bottom': Decimal('102.467'), 'text': 'Chapter'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('250.002'), 'x1': Decimal('263.344'), 'top': Decimal('88.215'), 'bottom': Decimal('102.467'), 'text': '6:'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('297.290'), 'x1': Decimal('347.021'), 'top': Decimal('88.215'), 'bottom': Decimal('102.467'), 'text': 'Energy'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('351.070'), 'x1': Decimal('406.084'), 'top': Decimal('88.215'), 'bottom': Decimal('102.467'), 'text': 'Systems'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('48.480'), 'x1': Decimal('54.000'), 'top': Decimal('91.369'), 'bottom': Decimal('101.405'), 'text': '1'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 278 ['Chapter', '6:', 'Energy', 'Systems', '1'] ... | ======page: 4 =========== image_dict {} W: {'x0': Decimal('72.024'), 'x1': Decimal('94.656'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Final'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('97.460'), 'x1': Decimal('152.461'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Government'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('155.277'), 'x1': Decimal('208.291'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Distribution'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('276.170'), 'x1': Decimal('311.034'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Chapter'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('313.827'), 'x1': Decimal('319.347'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': '6'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 348 ['Final', 'Government', 'Distribution', 'Chapter', '6'] ... 
| ======page: 5 =========== image_dict {} W: {'x0': Decimal('72.024'), 'x1': Decimal('94.656'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Final'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('97.460'), 'x1': Decimal('152.461'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Government'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('155.277'), 'x1': Decimal('208.291'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Distribution'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('276.170'), 'x1': Decimal('311.034'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Chapter'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('313.827'), 'x1': Decimal('319.347'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': '6'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 746 ['Final', 'Government', 'Distribution', 'Chapter', '6'] ... 
| ======page: 6 =========== image_dict {} W: {'x0': Decimal('72.024'), 'x1': Decimal('94.656'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Final'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('97.460'), 'x1': Decimal('152.461'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Government'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('155.277'), 'x1': Decimal('208.291'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Distribution'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('276.170'), 'x1': Decimal('311.034'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Chapter'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('313.827'), 'x1': Decimal('319.347'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': '6'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 715 ['Final', 'Government', 'Distribution', 'Chapter', '6'] ... 
| ======page: 7 =========== image_dict {} W: {'x0': Decimal('72.024'), 'x1': Decimal('94.656'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Final'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('97.460'), 'x1': Decimal('152.461'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Government'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('155.277'), 'x1': Decimal('208.291'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Distribution'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('276.170'), 'x1': Decimal('311.034'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Chapter'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('313.827'), 'x1': Decimal('319.347'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': '6'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 342 ['Final', 'Government', 'Distribution', 'Chapter', '6'] ... 
| ======page: 8 =========== images 2 | image: <class 'dict'>: dict_keys(['x0', 'y0', 'x1', 'y1', 'width', 'height', 'name', 'stream', 'srcsize', 'imagemask', 'bits', 'colorspace', 'object_type', 'page_number', 'top', 'bottom', 'doctop']) dict_values([Decimal('72'), Decimal('412.990'), Decimal('523.300'), Decimal('664.640'), Decimal('451.300'), Decimal('251.650'), 'Im0', <PDFStream(15): raw=143450, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'DCTDecode', 'Height': 779, 'Interpolate': True, 'Length': 143448, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 1397}>, (Decimal('1397'), Decimal('779')), None, 8, [/'DeviceRGB'], 'image', 8, Decimal('177.280'), Decimal('428.930'), Decimal('6070.720')]) stream <PDFStream(15): raw=143450, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'DCTDecode', 'Height': 779, 'Interpolate': True, 'Length': 143448, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 1397}> keys dict_keys(['x0', 'y0', 'x1', 'y1', 'width', 'height', 'name', 'stream', 'srcsize', 'imagemask', 'bits', 'colorspace', 'object_type', 'page_number', 'top', 'bottom', 'doctop']) xxyy ((Decimal('72'), Decimal('523.300')), (Decimal('412.990'), Decimal('664.640')), (Decimal('1397'), Decimal('779')), 'Im0', 8) image: ((Decimal('1397'), Decimal('779')), 143448) => (8, (Decimal('72'), Decimal('523.300')), (Decimal('412.990'), Decimal('664.640'))) image: <class 'dict'>: dict_keys(['x0', 'y0', 'x1', 'y1', 'width', 'height', 'name', 'stream', 'srcsize', 'imagemask', 'bits', 'colorspace', 'object_type', 'page_number', 'top', 'bottom', 'doctop']) dict_values([Decimal('72'), Decimal('203.730'), Decimal('523.300'), Decimal('405.380'), Decimal('451.300'), Decimal('201.650'), 'Im1', <PDFStream(16): raw=122018, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'DCTDecode', 'Height': 655, 'Interpolate': True, 'Length': 122016, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 1466}>, (Decimal('1466'), Decimal('655')), None, 8, 
[/'DeviceRGB'], 'image', 8, Decimal('436.540'), Decimal('638.190'), Decimal('6329.980')]) stream <PDFStream(16): raw=122018, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'DCTDecode', 'Height': 655, 'Interpolate': True, 'Length': 122016, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 1466}> keys dict_keys(['x0', 'y0', 'x1', 'y1', 'width', 'height', 'name', 'stream', 'srcsize', 'imagemask', 'bits', 'colorspace', 'object_type', 'page_number', 'top', 'bottom', 'doctop']) xxyy ((Decimal('72'), Decimal('523.300')), (Decimal('203.730'), Decimal('405.380')), (Decimal('1466'), Decimal('655')), 'Im1', 8) image: ((Decimal('1466'), Decimal('655')), 122016) => (8, (Decimal('72'), Decimal('523.300')), (Decimal('203.730'), Decimal('405.380'))) image_dict {((Decimal('1397'), Decimal('779')), 143448): (8, (Decimal('72'), Decimal('523.300')), (Decimal('412.990'), Decimal('664.640'))), ((Decimal('1466'), Decimal('655')), 122016): (8, (Decimal('72'), Decimal('523.300')), (Decimal('203.730'), Decimal('405.380')))} W: {'x0': Decimal('72.024'), 'x1': Decimal('94.656'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Final'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('97.460'), 'x1': Decimal('152.461'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Government'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('155.277'), 'x1': Decimal('208.291'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Distribution'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('276.170'), 'x1': Decimal('311.034'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Chapter'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('313.827'), 'x1': Decimal('319.347'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': '6'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 140 ['Final', 'Government', 'Distribution', 'Chapter', '6'] ... 
| ======page: 9 =========== images 1 | image: <class 'dict'>: dict_keys(['x0', 'y0', 'x1', 'y1', 'width', 'height', 'name', 'stream', 'srcsize', 'imagemask', 'bits', 'colorspace', 'object_type', 'page_number', 'top', 'bottom', 'doctop']) dict_values([Decimal('80.900'), Decimal('543.430'), Decimal('514.250'), Decimal('769.920'), Decimal('433.350'), Decimal('226.490'), 'Im0', <PDFStream(19): raw=204351, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 854, 'Interpolate': False, 'Length': 204349, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 1634}>, (Decimal('1634'), Decimal('854')), None, 8, [/'DeviceRGB'], 'image', 9, Decimal('72.000'), Decimal('298.490'), Decimal('6807.360')]) stream <PDFStream(19): raw=204351, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 854, 'Interpolate': False, 'Length': 204349, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 1634}> keys dict_keys(['x0', 'y0', 'x1', 'y1', 'width', 'height', 'name', 'stream', 'srcsize', 'imagemask', 'bits', 'colorspace', 'object_type', 'page_number', 'top', 'bottom', 'doctop']) xxyy ((Decimal('80.900'), Decimal('514.250')), (Decimal('543.430'), Decimal('769.920')), (Decimal('1634'), Decimal('854')), 'Im0', 9) image: ((Decimal('1634'), Decimal('854')), 204349) => (9, (Decimal('80.900'), Decimal('514.250')), (Decimal('543.430'), Decimal('769.920'))) image_dict {((Decimal('1397'), Decimal('779')), 143448): (8, (Decimal('72'), Decimal('523.300')), (Decimal('412.990'), Decimal('664.640'))), ((Decimal('1466'), Decimal('655')), 122016): (8, (Decimal('72'), Decimal('523.300')), (Decimal('203.730'), Decimal('405.380'))), ((Decimal('1634'), Decimal('854')), 204349): (9, (Decimal('80.900'), Decimal('514.250')), (Decimal('543.430'), Decimal('769.920')))} W: {'x0': Decimal('72.024'), 'x1': Decimal('94.656'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Final'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': 
Decimal('97.460'), 'x1': Decimal('152.461'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Government'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('155.277'), 'x1': Decimal('208.291'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Distribution'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('276.170'), 'x1': Decimal('311.034'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': 'Chapter'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) W: {'x0': Decimal('313.827'), 'x1': Decimal('319.347'), 'top': Decimal('38.069'), 'bottom': Decimal('48.105'), 'text': '6'} dict_keys(['x0', 'x1', 'top', 'bottom', 'text']) words 435 ['Final', 'Government', 'Distribution', 'Chapter', '6'] ... | wrote image coords to /Users/pm286/workspace/amilib/temp/pdf/ar6/chap6/image_coords.txt pdf_debug {((Decimal('1397'), Decimal('779')), 143448): (8, (Decimal('72'), Decimal('523.300')), (Decimal('412.990'), Decimal('664.640'))), ((Decimal('1466'), Decimal('655')), 122016): (8, (Decimal('72'), Decimal('523.300')), (Decimal('203.730'), Decimal('405.380'))), ((Decimal('1634'), Decimal('854')), 204349): (9, (Decimal('80.900'), Decimal('514.250')), (Decimal('543.430'), Decimal('769.920')))} outdir /Users/pm286/workspace/amilib/temp/pdf/ar6/chap6 ___________________ PDFCharacterTest.test_pdfminer_font_and_character_output ___________________ self = <test.test_pdf.PDFCharacterTest testMethod=test_pdfminer_font_and_character_output> @unittest.skipUnless(PDFTest.DEBUG, "too much output") def test_pdfminer_font_and_character_output(self): """Examines every character and annotates it Typical: LTPage LTTextBoxHorizontal Journal of Medicine and Life Volume 7, Special Issue 3, 2014 LTTextLineHorizontal Journal of Medicine and Life Volume 7, Special Issue 3, 2014 LTChar KAAHHD+Calibri,Itali J LTChar KAAHHD+Calibri,Itali o LTChar KAAHHD+Calibri,Itali u """ MAXITEM = 2 from pathlib import Path from typing import 
Iterable, Any > from pdfminer.high_level import extract_pages E ImportError: cannot import name 'extract_pages' from 'pdfminer.high_level' (/opt/anaconda3/lib/python3.8/site-packages/pdfminer/high_level.py) test/test_pdf.py:858: ImportError _____________________________ PDFCharacterTest.test_pdfminer_style _____________________________ self = <test.test_pdf.PDFCharacterTest testMethod=test_pdfminer_style> def test_pdfminer_style(self): """Examines every character and annotates it Typical: LTPage LTTextBoxHorizontal Journal of Medicine and Life Volume 7, Special Issue 3, 2014 LTTextLineHorizontal Journal of Medicine and Life Volume 7, Special Issue 3, 2014 LTChar KAAHHD+Calibri,Itali J LTChar KAAHHD+Calibri,Itali o LTChar KAAHHD+Calibri,Itali u """ from pathlib import Path from typing import Iterable, Any > from pdfminer.high_level import extract_pages E ImportError: cannot import name 'extract_pages' from 'pdfminer.high_level' (/opt/anaconda3/lib/python3.8/site-packages/pdfminer/high_level.py) test/test_pdf.py:961: ImportError ________________ PDFCharacterTest.test_pdfplumber_full_page_info_LOWLEVEL_CHARS ________________ self = <test.test_pdf.PDFCharacterTest testMethod=test_pdfplumber_full_page_info_LOWLEVEL_CHARS> def test_pdfplumber_full_page_info_LOWLEVEL_CHARS(self): """The definitive catalog of all objects on a page""" assert PMC1421_PDF.exists(), f"{PMC1421_PDF} should exist" include_float = False # don't test if values are floats # TODO use pytest.approx or similar # also ['_text', 'matrix', 'fontname', 'ncs', 'graphicstate', 'adv', 'upright', 'x0', 'y0', 'x1', 'y1', # 'width', 'height', 'bbox', 'size', 'get_text', # 'is_compatible', 'set_bbox', 'is_empty', 'is_hoverlap', # 'hdistance', 'hoverlap', 'is_voverlap', 'vdistance', 'voverlap', 'analyze', '] with pdfplumber.open(PMC1421_PDF) as pdf: first_page = pdf.pages[0] # print(type(first_page), first_page.__dir__()) """ dir: ['pdf', 'root_page', 'page_obj', 'page_number', 'rotation', 'initial_doctop', 
'cropbox', 'mediabox', 'bbox', 'cached_properties', 'is_original', 'pages', 'width', 'height', 'layout', 'annots', 'hyperlinks', 'objects', 'process_object', 'iter_layout_objects', 'parse_objects', 'debug_tablefinder', 'find_tables', 'extract_tables', 'extract_table', 'get_text_layout', 'search', 'extract_text', 'extract_words', 'crop', 'within_bbox', 'filter', 'dedupe_chars', 'to_image', 'to_dict', 'flush_cache', 'rects', 'lines', 'curves', 'images', 'chars', 'textboxverticals', 'textboxhorizontals', 'textlineverticals', 'textlinehorizontals', 'rect_edges', 'edges', 'horizontal_edges', 'vertical_edges', 'to_json', 'to_csv', ] """ assert first_page.page_number == 1 assert first_page.rotation == 0 assert first_page.initial_doctop == 0 # cropbox and medibox seem to vary beteween lists and tuples on different versionns of Python # assert first_page.cropbox == (0, 0, 595.22, 842) # assert first_page.mediabox == (0, 0, 595.22, 842) # assert first_page.bbox == (0, 0, 595.22, 842) > assert first_page.cached_properties == ['_rect_edges', '_curve_edges', '_edges', '_objects', '_layout'] E AssertionError: assert ['_rect_edges...s', '_layout'] == ['_rect_edges...s', '_layout'] E At index 1 diff: '_edges' != '_curve_edges' E Right contains one more item: '_layout' E Use -v to get the full diff test/test_pdf.py:1054: AssertionError ======================================= warnings summary ======================================= ../../../../opt/anaconda3/lib/python3.8/site-packages/numexpr/expressions.py:21 ../../../../opt/anaconda3/lib/python3.8/site-packages/numexpr/expressions.py:21 /opt/anaconda3/lib/python3.8/site-packages/numexpr/expressions.py:21: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. 
_np_version_forbids_neg_powint = LooseVersion(numpy.__version__) >= LooseVersion('1.12.0b1') ../../.local/lib/python3.8/site-packages/requests/__init__.py:87 /Users/pm286/.local/lib/python3.8/site-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.2.1) or chardet (4.0.0) doesn't match a supported version! warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " test/test_nlp.py::NLPTest::test_compute_text_similarity_STAT /opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:525: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None' warnings.warn( test/test_nlp.py::NLPTest::test_compute_text_similarity_STAT /opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:408: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['abov', 'afterward', 'alon', 'alreadi', 'alway', 'ani', 'anoth', 'anyon', 'anyth', 'anywher', 'becam', 'becaus', 'becom', 'befor', 'besid', 'cri', 'describ', 'dure', 'els', 'elsewher', 'empti', 'everi', 'everyon', 'everyth', 'everywher', 'fifti', 'formerli', 'forti', 'ha', 'henc', 'hereaft', 'herebi', 'hi', 'howev', 'hundr', 'inde', 'latterli', 'mani', 'meanwhil', 'moreov', 'mostli', 'nobodi', 'noon', 'noth', 'nowher', 'onc', 'onli', 'otherwis', 'ourselv', 'perhap', 'pleas', 'seriou', 'sever', 'sinc', 'sincer', 'sixti', 'someon', 'someth', 'sometim', 'somewher', 'themselv', 'thenc', 'thereaft', 'therebi', 'therefor', 'thi', 'thu', 'togeth', 'twelv', 'twenti', 'veri', 'wa', 'whatev', 'whenc', 'whenev', 'wherea', 'whereaft', 'wherebi', 'wherev', 'whi', 'yourselv'] not in stop_words. 
warnings.warn( test/test_pdf.py::PDFCharacterTest::test_download_all_hlab_shifts_convert_to_html test/test_wikidata.py::TestWikidataLookup_WIKI_NET::test_multiple_ids test/test_wikidata.py::TestWikidataLookup_WIKI_NET::test_simple_wikidata_query test/test_wikidata.py::TestWikidataLookup_WIKI_NET::test_wikidata_extractor test/test_wikidata.py::TestWikidataLookup_WIKI_NET::test_wikidata_extractor test/test_wikidata.py::TestWikidataLookup_WIKI_NET::test_wikidata_id_lookup test/test_wikidata.py::TestWikidataLookup_WIKI_NET::test_wikidata_id_lookup /opt/anaconda3/lib/python3.8/site-packages/urllib3/poolmanager.py:316: DeprecationWarning: The 'strict' parameter is no longer needed on Python 3+. This will raise an error in urllib3 v2.1.0. warnings.warn( test/test_stat.py::TestStat::test_plot_scatter_noel_oboyle_STAT_PLOT /opt/anaconda3/lib/python3.8/site-packages/sklearn/manifold/_mds.py:298: FutureWarning: The default value of `normalized_stress` will change to `'auto'` in version 1.4. To suppress this warning, manually set the value of `normalized_stress`. warnings.warn( -- Docs: https://docs.pytest.org/en/stable/warnings.html =================================== short test summary info ==================================== FAILED test/test_pdf.py::PDFPlumberTest::test_pdfplumber_json_single_page_debug - AttributeEr... FAILED test/test_pdf.py::PDFChapterTest::test_read_ipcc_chapter__debug - AttributeError: 'Pag... FAILED test/test_pdf.py::PDFCharacterTest::test_debug_page_properties_chap6_word_count_and_images_data_wg3_old__example FAILED test/test_pdf.py::PDFCharacterTest::test_pdfminer_font_and_character_output - ImportEr... FAILED test/test_pdf.py::PDFCharacterTest::test_pdfminer_style - ImportError: cannot import n... FAILED test/test_pdf.py::PDFCharacterTest::test_pdfplumber_full_page_info_LOWLEVEL_CHARS - As... ============== 6 failed, 141 passed, 73 skipped, 13 warnings in 69.46s (0:01:09) ====
6 test failures in PDF reading (all in test/test_pdf.py)