ffalt / pdf.js-extract

nodejs lib for extracting data from PDF files
Other
196 stars 50 forks source link

Fix for Y coordinate with new version of pdfjs #47

Open MCMattia opened 6 months ago

MCMattia commented 6 months ago

Hi @ffalt, thank you very much for this project. I have been using your extractBuffer function successfully in a browser environment.

While working with pdfjs-dist V4.0.269, I noticed that the y coordinate is slightly off. If you are considering upgrading pdfjs, I had success calculating the y coordinate in the following way:

page.getTextContent().then((content) => {
    // `content.items` holds one entry per text run; only position, size, direction,
    // font name and the string itself are copied into the page result.
    pag.content = content.items.map((item) => {
        // Combine the viewport transform with the item's own transform; entries 4 and 5
        // of the resulting matrix are used as the item's x and y position.
        const tx = Util.transform(viewport.transform, item.transform);
        return {
            x: tx[4],
            // Correction proposed in this issue for pdfjs-dist V4.0.269: subtract the
            // item height from the transformed y (presumably moving y from the text
            // baseline to the top edge — TODO confirm against rendered output).
            y: tx[5] - item.height,
            str: item.str,
            dir: item.dir,
            width: item.width,
            height: item.height,
            fontName: item.fontName
        };
    });
})

This would replace the block that you currently have here

I hope this will be of help

sujjeee commented 3 months ago

Hey @MCMattia how did you run this in a browser environment? I'm trying to use Cloudflare Workers, and it's not working there. Could you please let me know how you ran this in a browser environment?

MCMattia commented 3 months ago

Hi @sujjeee,

I adapted the code of this repo. This is my version:

ExtractorPdf.ts:

import { getDocument, Util } from "pdfjs-dist";
import type { TextItem } from "pdfjs-dist/types/src/display/api";
import type { PDFDocumentProxy } from "pdfjs-dist";
import type { TextBox } from "./ExtractorTypes";

/**
 * Flattens the extracted text of an already-opened PDF into a single list of boxes.
 *
 * @param document - The opened pdf.js document to read from.
 * @param pageNumbers - Optional page numbers, forwarded to `extractPdfDataFromDocument`.
 * @returns One `TextBox` per extracted text entry, tagged with the number of its page.
 */
export async function extractTextBoxesFromDocument(
    document: PDFDocumentProxy,
    pageNumbers?: number[]
) {
    const { pages } = await extractPdfDataFromDocument(document, pageNumbers);

    // Walk the pages in order and keep only the geometry and text of each entry.
    return pages.flatMap(({ pageInfo, content }) =>
        content.map(
            (entry): TextBox => ({
                x: entry.x,
                y: entry.y,
                width: entry.width,
                height: entry.height,
                pageNumber: pageInfo.num,
                text: entry.text
            })
        )
    );
}

/**
 * Opens the PDF identified by `pdfName` with pdf.js and extracts its data.
 *
 * @param pdfName - Source string handed straight to `getDocument`.
 * @param pageNumbers - Optional page numbers to extract (pages start at 1); every
 *   page is extracted when omitted.
 * @returns The extraction result produced by `extractPdfDataFromDocument`.
 */
export async function extractPdfData(pdfName: string, pageNumbers?: number[]) {
    const loadingTask = getDocument(pdfName);
    try {
        const document = await loadingTask.promise;
        // `return await` keeps the extraction inside the try block, so the loading
        // task is only destroyed once every requested page has been read.
        return await extractPdfDataFromDocument(document, pageNumbers);
    } finally {
        // The document is opened here and never handed to the caller, so release
        // the loading task (and what it owns) here as well.
        await loadingTask.destroy();
    }
}

/**
 * Extracts metadata and per-page content from an already-opened PDF document.
 *
 * @param document - The opened pdf.js document to read from.
 * @param pageNumbers - Optional page numbers to extract (pages start at 1). Numbers
 *   outside `1..document.numPages` are skipped. Every page is extracted when omitted.
 * @returns Document metadata, basic document info and one entry per extracted page,
 *   in the order the page numbers were requested.
 */
export async function extractPdfDataFromDocument(
    document: PDFDocumentProxy,
    pageNumbers?: number[]
): Promise<PDFExtractResult> {
    const firstPage = 1;
    const lastPage = document.numPages;
    const metadata = await document.getMetadata();
    const result: PDFExtractResult = {
        meta: {
            info: metadata.info,
            metadata: metadata.metadata ? metadata.metadata.getAll() || null : null
        },
        pages: [],
        pdfInfo: {
            numPages: lastPage,
            // Report the document's primary fingerprint instead of a hard-coded
            // empty string; fall back to "" if none is available.
            fingerprint: document.fingerprints[0] ?? ""
        }
    };

    // Default to every page of the document without reassigning the parameter.
    const requestedPages =
        pageNumbers ?? Array.from({ length: lastPage }, (_, index) => index + firstPage);

    // Pages are loaded one at a time so the result keeps the requested order.
    for (const pageNumber of requestedPages) {
        if (pageNumber >= firstPage && pageNumber <= lastPage) {
            result.pages.push(await loadPage(document, pageNumber));
        }
    }
    return result;
}

/**
 * Loads one page and collects its viewport info, link URLs and positioned text.
 *
 * @param document - The opened pdf.js document to read from.
 * @param pageNumber - Number of the page to load, passed to `document.getPage`.
 * @returns The page info, the URLs of its link annotations and its text entries.
 */
async function loadPage(
    document: PDFDocumentProxy,
    pageNumber: number
): Promise<PDFExtractPage> {
    const page = await document.getPage(pageNumber);
    const viewport = page.getViewport({ scale: 1.0 });
    const resultPage: PDFExtractPage = {
        pageInfo: {
            num: pageNumber,
            scale: viewport.scale,
            rotation: viewport.rotation,
            offsetX: viewport.offsetX,
            offsetY: viewport.offsetY,
            width: viewport.width,
            height: viewport.height
        },
        links: [],
        content: []
    };

    // Keep only link annotations that actually carry a URL.
    const annotations = await page.getAnnotations();
    resultPage.links = annotations
        .filter((annotation) => annotation.subtype === "Link" && !!annotation.url)
        .map((link) => link.url);

    const textContent = await page.getTextContent();
    resultPage.content = textContent.items
        // Narrow with a runtime check rather than an unchecked cast: entries that
        // have no `str` are not text runs and are skipped instead of crashing below.
        .filter((item): item is TextItem => "str" in item)
        .map((item) => {
            // Combine the viewport transform with the item's own transform; entries
            // 4 and 5 of the resulting matrix give the item's position.
            const tx = Util.transform(viewport.transform, item.transform);
            return {
                x: tx[4],
                // The item height is subtracted from the transformed y (presumably
                // moving y from the baseline to the top edge — TODO confirm).
                y: tx[5] - item.height,
                text: item.str,
                dir: item.dir,
                width: item.width,
                height: item.height,
                fontName: item.fontName
            };
        });

    return resultPage;
}

/** Complete result of extracting one PDF document. */
interface PDFExtractResult {
    /** Optional name of the source file. */
    filename?: string;
    /** Document-level metadata. */
    meta?: {
        /** Entries of the document's info dictionary. */
        info?: {
            PDFFormatVersion?: string;
            IsAcroFormPresent?: boolean;
            IsCollectionPresent?: boolean;
            IsLinearized?: boolean;
            IsXFAPresent?: boolean;
            Title?: string;
            Author?: string;
            Creator?: string;
            Producer?: string;
            CreationDate?: string;
            ModDate?: string;
        };
        /** Further metadata as name/value string pairs. */
        metadata?: Record<string, string>;
    };
    /** One entry per extracted page. */
    pages: PDFExtractPage[];
    /** Basic facts about the document itself. */
    pdfInfo: {
        /** Total number of pages in the document. */
        numPages: number;
        /** Fingerprint string of the document. */
        fingerprint: string;
    };
}

/** Extracted data for a single page. */
interface PDFExtractPage {
    /** Page number plus the viewport values the page was read with. */
    pageInfo: {
        /** Number of the page within the document. */
        num: number;
        /** Viewport scale. */
        scale: number;
        /** Viewport rotation. */
        rotation: number;
        /** Viewport horizontal offset. */
        offsetX: number;
        /** Viewport vertical offset. */
        offsetY: number;
        /** Viewport width. */
        width: number;
        /** Viewport height. */
        height: number;
    };
    /** URLs of the page's link annotations. */
    links: string[];
    /** Positioned text entries found on the page. */
    content: PDFExtractText[];
}

/** One piece of text extracted from a page, with its position and size. */
interface PDFExtractText {
    /** The extracted string. */
    text: string;
    /** Text direction as reported by pdf.js. */
    dir: string;
    /** Name of the font used for the text. */
    fontName: string;

    /** Horizontal position after applying the viewport transform. */
    x: number;
    /** Vertical position after applying the viewport transform, minus the height. */
    y: number;
    /** Width of the text. */
    width: number;
    /** Height of the text. */
    height: number;
}

ExtractorTypes.ts:

/** A rectangle of text located on a specific page of a PDF. */
export interface TextBox {
    /** The text inside the box. */
    text: string;
    /** Number of the page the box belongs to. */
    pageNumber: number;

    /** Horizontal position of the box. */
    x: number;
    /** Vertical position of the box. */
    y: number;
    /** Width of the box. */
    width: number;
    /** Height of the box. */
    height: number;
}
sujjeee commented 3 months ago

@MCMattia Oh, okay, I see you are using pdfjs-dist. I also tried pdfjs-dist, but it felt slow at getting the Y-coordinate of a piece of text. Basically, I was searching all pages of the PDF for the Y-coordinate of a given text.

Nowadays, everyone is talking about Cloudflare Workers, so I thought to use this in a serverless environment, but it's not working.

sujjeee commented 3 months ago

Hey @MCMattia, I'm trying to use pdfjs-dist in my Next.js app.

This is essentially the function:

import * as pdfjsLib from "pdfjs-dist";

/**
 * Opens a PDF held in memory with pdf.js.
 *
 * @param pdfBuffer - The PDF data passed to `getDocument`.
 */
export async function crop(pdfBuffer: ArrayBuffer) {
  // The document type is inferred from `getDocument(...).promise`, so no separate
  // type import is needed alongside the `pdfjsLib` namespace import.
  const pdfDoc = await pdfjsLib.getDocument(pdfBuffer).promise;
  // other code 
}

I'm calling this function when a button is clicked, but before that, I'm getting this error:

Error: Element type is invalid. Received a promise that resolves to: undefined. Lazy element type must resolve to a class or function.

Please let me know if you have any solutions to fix this issue.