wojtekmaj / react-pdf

Display PDFs in your React app as easily as if they were images.
https://projects.wojtekmaj.pl/react-pdf
MIT License
9.24k stars 877 forks source link

CustomTextRenderer not getting all text #1622

Closed pk-bt closed 11 months ago

pk-bt commented 11 months ago

Before you start - checklist

Description

When I use a custom text renderer, not all of the text is rendered. This does not happen with all documents.

Steps to reproduce

Opened this document: 1692367633856-RSB-Journal_3_23.pdf

I logged the results from customTextRenderer and from onGetTextSuccess. When the page is rendered, the renderer shows only part of the text on the page, whereas in onGetTextSuccess I can see all the text the page contains.

Expected behavior

CustomTextRenderer to render all TextItems

Actual behavior

CustomTextRenderer does not render all the text.

Additional information

My code:

import {Document, Page, pdfjs} from "react-pdf";
import React, {useCallback, useEffect, useMemo, useRef, useState} from "react";
import "pdfjs-dist/build/pdf.worker.entry";
import 'react-pdf/dist/Page/TextLayer.css';
import 'react-pdf/dist/Page/AnnotationLayer.css';
import "./PDFViewer.scss";
import {TextItem} from "pdfjs-dist/types/src/display/api";
import {Spin, Input} from "antd";

// Tell pdf.js where to load its worker script from. The URL is resolved
// relative to this module via import.meta.url and stored as a string.
// NOTE(review): the file also imports "pdfjs-dist/build/pdf.worker.entry";
// presumably only one of the two worker setups is needed — confirm.
pdfjs.GlobalWorkerOptions.workerSrc = new URL(
    'pdfjs-dist/build/pdf.worker.min.js',
    import.meta.url,
).toString();

/** Props accepted by the PDFViewer component. */
type TPDFViewerProps = {
    /** Forwarded to <Document> as its `file` prop. */
    fileUrl?: string;
    /**
     * "all" appends every page once the document has loaded;
     * "scroll-load" appends one more page each time the container is
     * scrolled to the bottom.
     */
    pageLoading: "all" | "scroll-load";
    /** Declared but not read anywhere in the visible component body. */
    height?: number;
};

const PDFViewer = (props: TPDFViewerProps) => {
    // Page numbers that currently get a <Page> element rendered for them.
    const [pages, setPages] = useState<number[]>([])
    // Ref to the scrollable container <div> that wraps the <Document>.
    const contRef = useRef<HTMLDivElement>(null)
    // How many page numbers have been appended to `pages` so far.
    const [pagesLoaded, setPagesLoaded] = useState(0)
    // Current value of the search input; used as the highlight pattern.
    const [searchText, setSearchText] = useState("")
    // `numPages` reported by the document's load-success callback.
    const [pagesTotal, setPagesTotal] = useState(0)
    // Flag asking the page-loading effect to append more pages; the effect
    // resets it to false after it has run.
    const [loadPages, setLoadPages] = useState(false)
    // Incremented each time a <Page> fires onLoadSuccess.
    const [pagesCompleted, setPagesCompleted] = useState(0)
    // Canvas elements of the rendered pages, indexed by position in `pages`.
    const canvasRef = useRef<(HTMLCanvasElement | null )[]>([])

    /**
     * Load-success callback of <Document>: stores the page count and raises
     * the `loadPages` flag so the page-loading effect starts appending pages.
     */
    const onDocumentLoadSuccess = (loaded: { numPages: number }) => {
        setPagesTotal(loaded.numPages)
        setLoadPages(true)
    };

    /**
     * Scroll handler of the page container. In "scroll-load" mode it raises
     * the `loadPages` flag once the container has been scrolled to its
     * bottom; in "all" mode it does nothing.
     *
     * @param e scroll event; `currentTarget` is the element the handler is
     *          attached to
     */
    const onScroll = (e: React.UIEvent<HTMLElement, UIEvent>) => {
        if (props.pageLoading === "all") {
            return
        }
        const {scrollHeight, scrollTop, clientHeight} = e.currentTarget
        // scrollTop may be fractional (browser zoom, high-DPI displays) while
        // scrollHeight and clientHeight are rounded, so a strict equality
        // check can miss the bottom. Allow a 1px tolerance instead.
        const reachedBottom = Math.abs(scrollHeight - clientHeight - scrollTop) <= 1
        if (reachedBottom) {
            setLoadPages(true)
        }
    };

    /**
     * Builds the HTML string for one text item, wrapping every occurrence of
     * `pattern` in a <mark> element.
     *
     * - An empty pattern returns the (escaped) text unchanged, so the text
     *   layer is not blanked while the search box is empty.
     * - All occurrences are marked, not only the first one.
     * - Text and pattern are HTML-escaped because the result is markup;
     *   presumably the caller injects it as HTML, so raw PDF text must not
     *   be able to introduce tags.
     *
     * @param text    raw text of the item
     * @param pattern literal (case-sensitive) search string; may be empty
     * @returns HTML-safe string with matches wrapped in <mark>…</mark>
     */
    function highlightPattern(text: string, pattern: string): string {
        const escapeHtml = (raw: string): string =>
            raw
                .replace(/&/g, "&amp;")
                .replace(/</g, "&lt;")
                .replace(/>/g, "&gt;")
                .replace(/"/g, "&quot;")
                .replace(/'/g, "&#39;");

        if (!pattern) {
            return escapeHtml(text);
        }

        // Split on the literal pattern and escape each piece separately, so
        // matching happens on the raw text and the <mark> tags stay intact.
        const marked = `<mark>${escapeHtml(pattern)}</mark>`;
        return text
            .split(pattern)
            .map((piece) => escapeHtml(piece))
            .join(marked);
    }

    // Custom text renderer handed to every <Page>: logs the incoming text item
    // and returns its string with the current search text highlighted. The
    // callback identity changes only when `searchText` changes.
    const textRenderer = useCallback((textItem: TextItem) => {
        console.log("CustomTextRenderer",textItem)
        return highlightPattern(textItem.str, searchText)
    }, [searchText]);

    // Appends page numbers to `pages` whenever the `loadPages` flag is raised:
    // every remaining page in "all" mode, exactly one more page otherwise.
    // Nothing is appended once the target would exceed `pagesTotal`.
    // NOTE(review): `textRenderer` is listed as a dependency although the
    // effect body does not read it.
    useEffect(() => {
        if (!loadPages || pagesTotal <= 0) {
            return
        }
        const target = props.pageLoading === "all" ? pagesTotal : pagesLoaded + 1
        const count = target <= pagesTotal ? Math.max(0, target - pagesLoaded) : 0
        const newPageNumbers = Array.from({length: count}, (_, offset) => pagesLoaded + offset + 1)
        newPageNumbers.forEach(pageNumber => {
            setPages(current => [...current, pageNumber])
        })
        setPagesLoaded(current => current + count)
        setLoadPages(false)
    }, [loadPages, pagesLoaded, pagesTotal, props.pageLoading, textRenderer]);

    // The spinner is shown only in "all" mode, while fewer than
    // `pagesTotal - 1` pages have reported a successful load.
    const showSpinner = props.pageLoading === "all" && pagesCompleted < pagesTotal - 1

    // One <Page> per entry in `pages`; the array index doubles as the React
    // key and as the slot in `canvasRef.current`.
    const pageElements = pages.map((pageNumber, index) => (
        <Page key={index}
              className="pdf-viewer-page"
              pageNumber={pageNumber}
              customTextRenderer={textRenderer}
              renderAnnotationLayer
              loading="Seite wird geladen"
              error="Die Seite konnte nicht geladen werden"
              onLoadSuccess={() => setPagesCompleted(done => done + 1)}
              onGetTextSuccess={({items}) => console.log("onGetTextSuccess",items)}
              canvasRef={(ref) => canvasRef.current[index] = ref}
        />
    ))

    // Layout: search input on top, scrollable document container below.
    return (
        <Spin size="large"
              className="pdf-viewer-spin-cont"
              spinning={showSpinner}
              tip="Dokument wird geladen, bitte warten"
        >
            <div className="pdf-viewer-navi">
                <Input value={searchText}
                       onKeyDown={onInputOnKeyDown}
                       onChange={e => setSearchText(e.currentTarget.value)}
                />
            </div>
            <div className="pdf-viewer-cont" ref={contRef} onScroll={onScroll}>
                <Document file={props.fileUrl}
                          onLoadSuccess={onDocumentLoadSuccess}
                          className="pdf-viewer-document"
                          loading="Dokument wird geladen"
                          noData="Datei nicht gefunden"
                          error="Das Dokument konnte nicht geladen werden"
                          options={options}
                >
                    {pageElements}
                </Document>
            </div>
        </Spin>
    )
}

export default PDFViewer

The screenshot below is from the first page only, but it is the same for all pages in this document (20 in total).

pdf-react

Environment

wojtekmaj commented 11 months ago

Duplicate of #1593

Although the test PDF is a really good example, so still, thanks for this.