galkahana / HummusJS

Node.js module for high performance creation, modification and parsing of PDF files and streams
http://www.pdfhummus.com
Other
1.14k stars 169 forks source link

How to update link annotation destinations after appending PDFs? #437

Closed zachesposito closed 4 years ago

zachesposito commented 4 years ago

Thanks to @galkahana for an extremely useful library. Just wanted to document how I solved this problem to help anyone else with the same need.

I have two PDFs, main.pdf and insert.pdf. I needed to insert insert.pdf into main.pdf at a specific page number. That is achievable by bisecting main.pdf at the insertion point and then appending the first part, then insert.pdf, and then the second part, to create output.pdf.

Then, I needed to make sure that link annotations in main.pdf's table of contents were preserved. That is achievable by following this example.

However, I noticed that when clicking a link after appending, the link destination is incorrectly offset by a number of pages equal to the length of insert.pdf. That makes sense, because the links were copied exactly from the original main.pdf, which didn't have the pages from insert.pdf.

To fix the offset, I needed to update the destination of all link annotations in main.pdf. Here's the set of functions I came up with to do so, drawing heavily from this test:

/**
 * Shifts the target page of link annotations in a PDF by a fixed number of
 * pages, e.g. after other pages were inserted in front of the link targets.
 *
 * The file is opened with `hummus.createWriterToModify`, each qualifying
 * annotation is rewritten as a modified indirect object whose `Dest` array
 * points at the shifted page, and the writer is ended afterwards.
 *
 * Only annotations that meet all of the following are rewritten; every other
 * annotation is left untouched:
 *  - it is referenced indirectly from the page's `Annots` array (a direct
 *    dictionary has no object ID to modify),
 *  - it has a `Dest` entry that is an array whose first element is an
 *    indirect reference (named destinations and page-number destinations
 *    are skipped),
 *  - its old target page can be located and the shifted index is an
 *    existing page.
 *
 * NOTE(review): every resolvable destination is shifted by `insertPDFLength`,
 * including ones that may point to pages located before the insertion
 * point — confirm this matches the caller's expectations.
 *
 * @param {string} PDFPath - Path of the PDF file to modify.
 * @param {number} insertPDFLength - Number of pages to add to each
 *   destination's page index.
 * @returns {void}
 */
function updateLinkDestinations(PDFPath, insertPDFLength) {
  const writer = hummus.createWriterToModify(PDFPath);
  const reader = writer.getModifiedFileParser(PDFPath);
  const copyingContext = writer.createPDFCopyingContextForModifiedFile();
  const objectContext = writer.getObjectsContext();
  const pageIDs = getPageIDs(reader);
  const pagesCount = reader.getPagesCount();

  for (let i = 0; i < pagesCount; i++) {
    const pageDictionary = reader.parsePageDictionary(i);
    if (!pageDictionary.exists("Annots")) {
      continue;
    }
    const annots = reader.queryDictionaryObject(pageDictionary, "Annots");

    for (let j = 0; j < annots.getLength(); j++) {
      // The raw array entry; only an indirect reference carries the object ID
      // needed to rewrite the annotation in place.
      const annotationIndirectReference = annots.queryObject(j);
      if (
        annotationIndirectReference.getType() !==
        hummus.ePDFObjectIndirectObjectReference
      ) {
        continue;
      }

      // The resolved annotation dictionary behind that reference.
      const annotation = reader.queryArrayObject(annots, j);
      if (
        !annotation ||
        annotation.getType() !== hummus.ePDFObjectDictionary ||
        !annotation.exists("Dest")
      ) {
        continue;
      }

      const destPDFArray = reader.queryDictionaryObject(annotation, "Dest");
      if (!destPDFArray || destPDFArray.getType() !== hummus.ePDFObjectArray) {
        continue;
      }

      const destArrayObject = destPDFArray.toJSArray();
      if (
        destArrayObject.length === 0 ||
        destArrayObject[0].getType() !== hummus.ePDFObjectIndirectObjectReference
      ) {
        continue;
      }

      // Resolve the new target before anything is written, so an unresolvable
      // destination never leaves a half-written or mis-targeted object behind.
      const oldDestPageID = destArrayObject[0].getObjectID();
      const oldDestPageIndex = getOldPageIDIndexInOldPDF(reader, oldDestPageID);
      if (oldDestPageIndex < 0) {
        continue;
      }
      const newDestPageID = pageIDs[oldDestPageIndex + insertPDFLength];
      if (newDestPageID === undefined) {
        continue;
      }

      objectContext.startModifiedIndirectObject(
        annotationIndirectReference.getObjectID()
      );
      const modifiedAnnotation = objectContext.startDictionary();

      // Copy all keys except Dest to the modified annotation.
      const annotationObject = annotation.toJSObject();
      for (const key of Object.getOwnPropertyNames(annotationObject)) {
        if (key !== "Dest") {
          modifiedAnnotation.writeKey(key);
          copyingContext.copyDirectObjectAsIs(annotationObject[key]);
        }
      }

      // Add the Dest key as an array whose first element is the new target page.
      modifiedAnnotation.writeKey("Dest");
      objectContext.startArray().writeIndirectObjectReference(newDestPageID);

      // Copy the remaining elements of the old Dest array unchanged.
      for (let k = 1; k < destArrayObject.length; k++) {
        copyingContext.copyDirectObjectAsIs(destArrayObject[k]);
      }

      objectContext
        .endArray()
        .endLine()
        .endDictionary(modifiedAnnotation)
        .endIndirectObject();
    }
  }

  writer.end();
}

/**
 * Collects the object ID of every page known to the reader, in page order.
 *
 * @param {object} reader - Parser exposing `getPagesCount()` and
 *   `getPageObjectID(index)`.
 * @returns {Array} One page object ID per page index, starting at page 0.
 */
function getPageIDs(reader) {
  const pageObjectIDs = [];
  let pageIndex = 0;
  while (pageIndex < reader.getPagesCount()) {
    pageObjectIDs[pageIndex] = reader.getPageObjectID(pageIndex);
    pageIndex += 1;
  }
  return pageObjectIDs;
}

/**
 * Finds the position of a page object among the `Kids` of its `Parent` node.
 *
 * The page object is parsed by ID, its `Parent` entry is resolved, and the
 * object IDs listed in the parent's `Kids` array are searched.
 *
 * NOTE(review): the result is an index into the immediate parent's `Kids`
 * only; it presumably equals the page index just for a flat page tree —
 * confirm for documents with nested page-tree nodes.
 *
 * @param {object} reader - Parser exposing `parseNewObject` and
 *   `queryDictionaryObject`.
 * @param {number} oldPageID - Object ID of the page to locate.
 * @returns {number} Zero-based position in the parent's `Kids`, or -1 when
 *   the ID is not listed there.
 */
function getOldPageIDIndexInOldPDF(reader, oldPageID) {
  const pageObject = reader.parseNewObject(oldPageID);
  const pageDictionary = pageObject.toPDFDictionary();
  const parentNode = reader.queryDictionaryObject(pageDictionary, 'Parent');
  const kidReferences = parentNode.toJSObject().Kids.toJSArray();

  const kidIDs = [];
  for (const kidReference of kidReferences) {
    kidIDs.push(kidReference.getObjectID());
  }
  return kidIDs.indexOf(oldPageID);
}

The trickiest part was understanding that the links still reference the page IDs from main.pdf, and those page IDs are not found when iterating through output.pdf's pages. To get the index of the page with oldDestPageID, you therefore have to look at the old page's parent's Kids key, which is an array of the pages as they were in main.pdf. You can then add the length of insert.pdf to the old page's index to get the index of the correct page to link to in output.pdf.

I will make a PR to the examples repo to provide a full working example.