Hopding / pdf-lib

Create and modify PDF documents in any JavaScript environment
https://pdf-lib.js.org
MIT License
6.93k stars 664 forks source link

Copying pages to a new PDF document brings over all images in the Resources/XObject section, even those not used on the page #1662

Open seisiuneer opened 3 months ago

seisiuneer commented 3 months ago

What were you trying to do?

I am working on splitting a PDF document (PDF of music scores generated with a music transcription tool I've built) into individual page ranges, using a common pattern I've seen recommended for doing this sort of thing with pdf-lib.

How did you attempt to do it?

/**
 * Split a PDF into one new document per requested page range.
 *
 * All pages of a range are copied with a single `copyPages` call. Copying
 * page by page makes every call copy that page's whole object graph again,
 * so objects shared between pages (fonts, images listed in a shared
 * Resources dictionary) end up duplicated once per page in the output.
 *
 * @param {*} pdfBytes - Serialized source PDF, passed straight to
 *   `PDFDocument.load`.
 * @param {Array<{start: number, end: number}>} ranges - Inclusive page index
 *   ranges; each index is handed to `copyPages` unchanged.
 * @returns {Promise<Array>} The result of `save()` for each range, in the
 *   same order as `ranges`.
 */
async function splitPDF(pdfBytes, ranges) {
  const { PDFDocument } = PDFLib;

  const originalPdf = await PDFDocument.load(pdfBytes);

  const splitPdfs = [];

  for (const range of ranges) {
    const newPdf = await PDFDocument.create();

    // Collect every index of the range so the pages can be copied together.
    const pageIndices = [];
    for (let i = range.start; i <= range.end; i++) {
      pageIndices.push(i);
    }

    const copiedPages = await newPdf.copyPages(originalPdf, pageIndices);
    for (const copiedPage of copiedPages) {
      newPdf.addPage(copiedPage);
    }

    const newPdfBytes = await newPdf.save();

    splitPdfs.push(newPdfBytes);
  }

  return splitPdfs;
}

What actually happened?

Unfortunately, what I find in the split files that get written out is that all of the images referenced in the original PDF are present in the split PDF files, and I see entries for them in the context indirectObjects. The split files are essentially all the same size as the original complete PDF.

What did you expect to happen?

It looks like copyPages() doesn't filter out the unused images, it just copies the entire set of images referenced in the original PDF document you're copying from.

If I look at the actual operators using a PDF parser, I can see they only reference the images being used for the page range, but the resulting PDF files are all essentially the size of the original PDF file before the split.

I've seen a few posts about issues with file size using copyPages() to split the files, and I'm guessing this is the root cause.

How can we reproduce the issue?

Take an existing PDF file that has many images and try to split it into individual files. I've attached a typical example of the sort of PDF generated by my tool that I'm trying to split into individual PDFs per page. Retreat_Tunes_Played_Slowly_2024_Standard_Notation.pdf

Version

1.17.1

What environment are you running pdf-lib in?

Browser

Checklist

Additional Notes

No response

seisiuneer commented 3 months ago

Here's an example of the results of splitting a PDF with several pages, this is the first page. If you look at the resources, you'll see it has references to many unused images. My Darling Asleep.pdf

seisiuneer commented 3 months ago

Compare that to this version of the same tune, but this was from a version of the same PDF tunebook that only had the one tune. No extra unused image resources are present. My Darling Asleep.pdf

seisiuneer commented 3 months ago

This was the original PDF fed to the splitter that had multiple pages of images. Retreat_Tunes_Played_Slowly_2024_Standard_Notation.pdf

seisiuneer commented 3 months ago

I was finally able to split the pages by range and delete unused images before exporting the split pages, this returns the pdf bytes for a range of pages in an original PDF with unused images stripped.

Now, this works for me because I know exactly how I'm creating the original PDF using jsPDF and know its deterministic structure; it may not be a general solution. I figured that I needed three things: a way to get at the list of XObject images used in the document, a way to delete them, and a way to get at the raw command stream for each page, from which I could figure out which images are actually used in the split pages and delete the rest.

splitPDF(originalPdf, range) takes in a PDFDocument and a {start:startpage, end:endpage} range and returns the bytes for the new split PDF document for saving or other processing. In my case, I just put them in a Blob and save the file (code not provided here).

/**
 * Count the entries whose key name contains the text "/I".
 *
 * @param {{dict: {entries: Function}}} dict - Wrapper whose `dict` property
 *   exposes `entries()` yielding `[key, value]` pairs; each key must carry an
 *   `encodedName` string.
 * @returns {number} How many keys have "/I" somewhere in `encodedName`.
 */
function countImagesInPDF(dict) {

    const imageEntries = Array.from(dict.dict.entries()).filter(
        ([name]) => name.encodedName.includes("/I")
    );

    return imageEntries.length;

}

/**
 * Find the first key object whose `encodedName` equals `value`.
 *
 * @param {*} value - Name to look for, e.g. "/I3".
 * @param {{dict: {entries: Function}}} dict - Wrapper whose `dict` property
 *   exposes `entries()` yielding `[key, value]` pairs.
 * @returns {*} The matching key object, or `undefined` when no key matches.
 */
const findKeyForValue = (value, dict) => {

    for (const [name] of Array.from(dict.dict.entries())) {

        // Loose comparison is kept deliberately so matching behaves exactly
        // as it always has for non-string arguments.
        if (name.encodedName == value) {
            return name;
        }
    }

    return undefined;
};

/**
 * Parse the content stream for this page and find the images it draws.
 *
 * Scans the decoded stream text for `/I<number> Do` operations and returns
 * the numbers. Only names of the exact form `/I<digits>` followed by the
 * `Do` operator count, so other names that merely start with "/I" are
 * ignored, and a draw is found wherever it sits on a line.
 *
 * NOTE(review): the bytes in `Contents().contents` are decoded as-is. If the
 * stream is compressed, no draw will be found — confirm the PDFs fed to this
 * function carry uncompressed content streams.
 *
 * @param {*} thePage - Page object exposing `node.Contents()`.
 * @returns {number[]} Image numbers in order of appearance (repeats kept).
 * @throws {Error} When the page has a Contents object without raw `contents`
 *   bytes. Returning an empty list there would tell the caller that no image
 *   is in use.
 */
function getImagesInThisPage(thePage) {

    const theContents = thePage.node.Contents();

    // A page with no content stream draws nothing at all.
    if (theContents == null) {
        return [];
    }

    if (theContents.contents == null) {
        throw new Error(
            'getImagesInThisPage: page Contents has no raw "contents" bytes to scan'
        );
    }

    const rawString = new TextDecoder('utf-8').decode(theContents.contents);

    // "/I<n>", whitespace, then the Do operator as a complete token.
    const imageDrawPattern = /\/I(\d+)\s+Do(?=\s|$)/g;

    const imageList = [];

    let match = imageDrawPattern.exec(rawString);
    while (match !== null) {
        imageList.push(Number.parseInt(match[1], 10));
        match = imageDrawPattern.exec(rawString);
    }

    return imageList;
}

/**
 * Build a new PDF from a page range of `originalPdf` and strip the images
 * that none of the copied pages draws.
 *
 * The pages are copied with a single `copyPages` call so objects shared
 * between them are copied once. Because pages may then share one XObject
 * dictionary, an image is treated as used when ANY copied page draws it; only
 * images drawn by no page are deleted, and their dictionary entries are
 * removed so no dangling reference is left behind.
 *
 * Only XObject entries named `/I<digits>` are considered; everything else is
 * left untouched. Which images a page draws is decided by
 * `getImagesInThisPage`.
 *
 * @param {*} originalPdf - Source document, as accepted by `copyPages`.
 * @param {{start: number, end: number}} range - Inclusive page index range;
 *   each index is handed to `copyPages` unchanged.
 * @returns {Promise<*>} The result of `save()` on the new document.
 */
async function splitPDF(originalPdf, range) {

    const {
        PDFDocument,
        PDFName,
        PDFDict
    } = PDFLib;

    const newPdf = await PDFDocument.create();

    const pageIndices = [];
    for (let i = range.start; i <= range.end; i++) {
        pageIndices.push(i);
    }

    const copiedPages = await newPdf.copyPages(originalPdf, pageIndices);
    for (const copiedPage of copiedPages) {
        newPdf.addPage(copiedPage);
    }

    // Pass 1: gather every image number drawn by any page, plus each distinct
    // XObject dictionary (a Set, because pages can share the same dictionary).
    const usedImageNumbers = new Set();
    const xObjectDicts = new Set();

    for (const thisPage of newPdf.getPages()) {

        for (const imageNumber of getImagesInThisPage(thisPage)) {
            usedImageNumbers.add(imageNumber);
        }

        // Pages without Resources or without an XObject dictionary have
        // nothing to strip.
        const resources = thisPage.node.Resources();
        const xObjects = resources
            ? resources.lookupMaybe(PDFName.of('XObject'), PDFDict)
            : undefined;

        if (xObjects) {
            xObjectDicts.add(xObjects);
        }
    }

    // Pass 2: drop every /I<n> entry whose image no page draws.
    const imageKeyPattern = /^\/I(\d+)$/;

    for (const xObjects of xObjectDicts) {

        // Snapshot the entries so deleting while looping is safe.
        for (const [key, imageRef] of Array.from(xObjects.entries())) {

            const match = imageKeyPattern.exec(key.encodedName);

            if (match === null) {
                continue;
            }

            if (usedImageNumbers.has(Number.parseInt(match[1], 10))) {
                continue;
            }

            newPdf.context.delete(imageRef);
            xObjects.delete(key);
        }
    }

    const newPdfBytes = await newPdf.save();

    return newPdfBytes;

}