Closed ibrahimshuail closed 3 years ago
Hi @ibrahimshuail Could you please format the code correctly? It appears you used single backtick. For multiline code blocks, use triple backticks. Example
```python
code here
Hi @samkit-jain
Please find the below-attached code
def curves_to_edges(cs):
edges = []
for c in cs:
edges += pdfplumber.utils.rect_to_edges(c)
return edges
all_text = ''
with pdfplumber.open(path) as pdf:
for pdf_page in pdf.pages:
pdf_page = pdf_page.crop((0,50,1000,700))
ts = {"vertical_strategy": "explicit","horizontal_strategy": "explicit","explicit_vertical_lines": curves_to_edges(pdf_page.curves + pdf_page.edges),"explicit_horizontal_lines": curves_to_edges(pdf_page.curves + pdf_page.edges),"intersection_y_tolerance": 10,}
bboxes = [table.bbox for table in pdf_page.find_tables(table_settings=ts)]
def not_within_bboxes(obj):
def obj_in_bbox(_bbox):
v_mid = (obj["top"] + obj["bottom"]) / 2
h_mid = (obj["x0"] + obj["x1"]) / 2
x0, top, x1, bottom = _bbox
return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
return not any(obj_in_bbox(__bbox) for __bbox in bboxes)
single_page_text = pdf_page.filter(not_within_bboxes).extract_text()
# print(single_page_text)
all_text = all_text + '\n' + single_page_text
text=all_text
the issue which we are facing
f"Bounding box {bbox} is not fully within " ValueError: Bounding box (Decimal('0'), Decimal('50'), Decimal('1000'), Decimal('700')) is not fully within parent page bounding box (Decimal('0'), Decimal('0'), Decimal('612'), Decimal('792'))
@ibrahimshuail That is because your page has a width of 612 but in the crop method you are providing 1000. You can update the invocation like
pdf_page = pdf_page.crop((0, 50, min(1000, pdf_page.width), min(700, pdf_page.height)))
thanks this works @samkit-jain
The below set of code stopped working when upgraded to 0.5.24 is there any other altrenative method to crop the page
for pdf_page in pdf.pages: pdf_page = pdf_page.crop((0,50,1000,700)) ts = {"vertical_strategy": "explicit","horizontal_strategy": "explicit","explicit_vertical_lines": curves_to_edges(pdf_page.curves + pdf_page.edges),"explicit_horizontal_lines": curves_to_edges(pdf_page.curves + pdf_page.edges),"intersection_y_tolerance": 10,} bboxes = [table.bbox for table in pdf_page.find_tables(table_settings=ts)] def not_within_bboxes(obj): def obj_in_bbox(_bbox): v_mid = (obj["top"] + obj["bottom"]) / 2 h_mid = (obj["x0"] + obj["x1"]) / 2 x0, top, x1, bottom = _bbox return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom) return not any(obj_in_bbox(__bbox) for __bbox in bboxes) single_page_text = pdf_page.filter(not_within_bboxes).extract_text() all_text = all_text + '\n' + single_page_text