VikParuchuri / marker

Convert PDF to markdown quickly with high accuracy
https://www.datalab.to
GNU General Public License v3.0
17.61k stars 1.01k forks source link

Feature request: URL extraction #283

Open ShakirAkbari opened 1 month ago

ShakirAkbari commented 1 month ago

Requesting an additional feature when extracting information from PDFs.

Can you please add the ability to extract URLs from the document?

ShakirAkbari commented 1 month ago

I wrote this code to pull link text and link URLs out of PDFs. Maybe you can incorporate part of it into your code base, with a setting that enables `extract_links_from_pdf`:

"""Extract hyperlinks, and the visible text they cover, from a PDF.

Link targets are read from each page's ``/Link`` annotations (PyPDF2);
the visible text is recovered by matching pdfminer character positions
against each annotation rectangle.
"""

# NOTE: `get_objects` and `Link` are not used below; they are kept only
# because the original snippet imported them.
from gc import get_objects
from itertools import groupby
from operator import itemgetter

import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal
from pypdf import PdfReader
from pypdf.annotations import Link


def parse_pdf(filename):
    """Print the extracted text of every page of the PDF at *filename*.

    Each page's text is printed preceded by a blank line.
    """
    reader = PdfReader(filename)
    for page in reader.pages:
        print(f'\n{page.extract_text()}')


def _collect_page_links(page):
    """Return ``(uri, x1, y1, x2, y2)`` for every URI link annotation on *page*.

    Only annotations whose ``/Subtype`` is ``/Link`` and that carry both an
    ``/A`` action with a ``/URI`` entry and a ``/Rect`` are returned. The
    rectangle is normalized so that ``x1 <= x2`` and ``y1 <= y2``.
    """
    if '/Annots' not in page:
        return []

    page_links = []
    for annotation in page['/Annots']:
        annotation_object = annotation.get_object()
        # Membership is tested before indexing so that an annotation
        # without a /Subtype is skipped instead of raising KeyError.
        if '/Subtype' not in annotation_object:
            continue
        if annotation_object['/Subtype'] != '/Link':
            continue
        if '/A' not in annotation_object or '/Rect' not in annotation_object:
            continue
        action = annotation_object['/A']
        if '/URI' not in action:
            continue

        xa, ya, xb, yb = (float(coord) for coord in annotation_object['/Rect'])
        # The two corners may be listed in either order; normalize them so
        # the containment test in associate_text_with_links() still works.
        page_links.append((
            action['/URI'],
            min(xa, xb),
            min(ya, yb),
            max(xa, xb),
            max(ya, yb),
        ))
    return page_links


def extract_links_from_pdf(pdf_path):
    """Return ``(url, link_text)`` pairs for the URI links in a PDF.

    Pages are processed in order. Within a page, links are ordered from
    top to bottom, and consecutive entries that point at the same URL
    (for example a link wrapped over two lines) are merged into one.
    Merging is done per page so that links from different pages are never
    interleaved or fused.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(url, link_text)`` tuples.
    """
    links = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(pdf_reader.pages):
            page_links = _collect_page_links(page)
            if not page_links:
                # Skip the (expensive) layout analysis for link-free pages.
                continue
            page_text = extract_text_with_positions(pdf_path, page_num)
            links.extend(
                merge_adjacent_links(
                    associate_text_with_links(page_text, page_links)
                )
            )
    return links


def extract_text_with_positions(pdf_path, page_num):
    """Return ``(char, x0, y1)`` for each character on one page.

    Only ``LTChar`` objects found inside horizontal text lines of text
    containers are reported; ``x0`` / ``y1`` are the character box's left
    and top edges as reported by pdfminer.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Zero-based page index, passed to pdfminer's
            ``page_numbers`` filter.
    """
    text_with_positions = []
    for page_layout in extract_pages(pdf_path, page_numbers=[page_num]):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                if not isinstance(text_line, LTTextLineHorizontal):
                    continue
                text_with_positions.extend(
                    (character.get_text(), character.x0, character.y1)
                    for character in text_line
                    if isinstance(character, LTChar)
                )
    return text_with_positions


def associate_text_with_links(page_text, page_links):
    """Attach to each link the characters that fall inside its rectangle.

    A character belongs to a link when its ``(x, y)`` position lies within
    the link rectangle, bounds included.
    NOTE(review): this assumes the annotation rectangle and the pdfminer
    character positions share one coordinate space - confirm for rotated
    or cropped pages.

    Args:
        page_text: Iterable of ``(char, x, y)`` tuples.
        page_links: Iterable of ``(uri, x1, y1, x2, y2)`` tuples.

    Returns:
        A list of ``(uri, link_text, y1)`` tuples, one per input link.
    """
    associated_links = []
    for uri, x1, y1, x2, y2 in page_links:
        link_text = ''.join(
            char
            for char, char_x, char_y in page_text
            if x1 <= char_x <= x2 and y1 <= char_y <= y2
        )
        associated_links.append((uri, link_text, y1))
    return associated_links


def merge_adjacent_links(links):
    """Merge consecutive links that share a URL into one entry.

    Links are ordered by descending ``y`` (top of the page first), ties
    broken by URL. The texts of each run of equal URLs are joined with a
    single space and stripped. Entries with an empty URL are dropped.

    Args:
        links: Iterable of ``(url, text, y)`` tuples.

    Returns:
        A list of ``(url, merged_text)`` tuples.
    """
    ordered = sorted(links, key=lambda link: (-link[2], link[0]))
    return [
        (url, ' '.join(text for _, text, _ in run).strip())
        for url, run in groupby(ordered, key=itemgetter(0))
        if url
    ]


if __name__ == '__main__':
    pdf_path = 'data/sample.pdf'
    parse_pdf(pdf_path)
    print("\n---")
    extracted_links = extract_links_from_pdf(pdf_path)
    for link, text in extracted_links:
        print(f"Link Text: {text}")
        print(f"Link: {link}")
        print("---")