Open NamanS-14 opened 6 months ago
I also got this error. I solved it by installing from the GitHub repo; alternatively, you have to fix the PyPDF2 problems in handlers.py yourself.
Could you please explain in a little more detail how I can solve this issue? Which file do I need to install from GitHub, as you mentioned above?
I got the same error and updated the library's code in the file named handlers.py; now it's working:
import os import sys import PyPDF2 from PyPDF2 import PdfFileReader
from .core import TableList from .parsers import Stream, Lattice from .utils import ( TemporaryDirectory, get_page_layout, get_text_objects, get_rotation, is_url, download_url, )
class PDFHandler:
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    """

    def __init__(self, filepath, pages="1", password=None):
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        if password is None:
            self.password = ""
        else:
            self.password = password
            # Legacy Python 2 path, kept for parity with the original code.
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)

    @staticmethod
    def _expand_page_ranges(pages, num_pages):
        """Expands a page specification into a sorted list of page numbers.

        Parameters
        ----------
        pages : str
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.
        num_pages : int
            Total number of pages in the document; substituted for
            'end' and used as the upper bound for 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        Raises
        ------
        ValueError
            If a page number is not an integer or a range contains
            more than one '-'.

        """
        if pages == "all":
            return list(range(1, num_pages + 1))

        selected = set()
        for part in pages.split(","):
            if "-" in part:
                start, end = part.split("-")
                last = num_pages if end == "end" else int(end)
                selected.update(range(int(start), last + 1))
            else:
                selected.add(int(part))
        return sorted(selected)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            List of int page numbers.

        """
        # The default selection never needs the page count, so the file
        # is not opened at all in that case.
        if pages == "1":
            return [1]

        # The context manager closes the file even if reading, decrypting
        # or parsing the page specification raises.
        with open(filepath, "rb") as instream:
            infile = PyPDF2.PdfReader(instream, strict=False)
            if infile.is_encrypted:
                infile.decrypt(self.password)
            # len(infile.pages) replaces getNumPages(), which PyPDF2 3.0
            # removed.
            num_pages = len(infile.pages)
        return self._expand_page_ranges(pages, num_pages)

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``page-<page>.pdf`` inside ``temp``. If
        get_rotation reports a rotation, the page is rewritten to the
        same path after rotating it by 90 degrees in the opposite
        direction.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, f"page-{page}.pdf")

        # The source file has to stay open until the single page has been
        # written out.
        with open(filepath, "rb") as fileobj:
            infile = PyPDF2.PdfReader(fileobj, strict=False)
            if infile.is_encrypted:
                infile.decrypt(self.password)
            outfile = PyPDF2.PdfWriter()
            outfile.add_page(infile.pages[page - 1])
            with open(fpath, "wb") as f:
                outfile.write(f)

        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation == "":
            return

        # Build the name from the temp directory and page number directly.
        # A str.replace("page", "p") over the whole path would also rewrite
        # a temp directory whose name happens to contain "page".
        fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
        os.rename(fpath, fpath_new)
        with open(fpath_new, "rb") as instream:
            infile = PyPDF2.PdfReader(instream, strict=False)
            if infile.is_encrypted:
                infile.decrypt(self.password)
            outfile = PyPDF2.PdfWriter()
            p = infile.pages[0]
            # rotate() takes a clockwise angle; it replaces
            # rotateClockwise / rotateCounterClockwise, which PyPDF2 3.0
            # removed.
            if rotation == "anticlockwise":
                p.rotate(90)
            elif rotation == "clockwise":
                p.rotate(-90)
            outfile.add_page(p)
            with open(fpath, "wb") as f:
                outfile.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            None is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        # A fresh dict per call avoids sharing one mutable default object
        # between invocations.
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))
Or simply install a compatible version: `!pip install PyPDF2==2.12.1`, then `!pip install 'camelot-py[base]'`
Make sure both of them have been uninstalled beforehand.
While following every step of the installation as given, I get an error that I am unable to resolve. It comes from PyPDF2 3.0.0, in the file /usr/local/lib/python3.10/dist-packages/PyPDF2/_utils.py. I am attaching a screenshot of the error. Please help me get the library running.![Screenshot 2023-12-31 203453](https://github.com/camelot-dev/camelot/assets/155222049/eb1df837-ad2f-47fc-9091-c5b49253809b)