galkahana / HummusJS

Node.js module for high performance creation, modification and parsing of PDF files and streams
http://www.pdfhummus.com
Other
1.14k stars 169 forks source link

Question: Appending page with file attachments loses attachment #400

Closed sebastinez closed 5 years ago

sebastinez commented 5 years ago

Hi Gal,

Thank you very much for this great library! I'm still in awe of the power behind the manipulation and creation of PDFs with this library. If I one day reach a level at which I think I can contribute, I'll let you know.

Right now I'm trying to wrap my mind around an issue I have with appending a PDF file to a page. I used your library to prepend a title sheet with some text and some images to PDF files. That all works like a charm. The problem I have is that the mentioned PDF files usually come with a file attachment (usually some editable document like .docx or .xlsx), and after appending the document, this attachment doesn't show up in the output.

I could find some references in your documentation regarding IndirectObjectReferences, CopyingContexts and low-level objects, but honestly right now it's going over my head. Nevertheless, when reading the PDF files I find some object references, and things that seem to be bodies of data.

Right now my append workflow looks like this:

  // `fs` is needed for readFileSync below; the snippet as originally posted
  // used it without requiring it.
  const fs = require("fs");
  const hummus = require("hummus");
  const streams = require("memory-streams");

  // The PDF whose pages get appended (this is the file carrying the attachment).
  const buffer = fs.readFileSync("./test.pdf");
  // In-memory sink that receives the modified PDF.
  const pdfAppendStream = new streams.WritableStream();

  // Open the title sheet ("caratula.pdf") for modification and write the
  // result to the in-memory stream; hummus logs to "log.txt".
  const pdfAppend = hummus.createWriterToModify(
    new hummus.PDFRStreamForFile("./caratula.pdf"),
    new hummus.PDFStreamForResponse(pdfAppendStream),
    { log: "log.txt" }
  );

  // Append every page of test.pdf after the title sheet.
  // NOTE: as reported in this issue, the file attachment of the appended
  // document does not show up in the output produced this way.
  pdfAppend.appendPDFPagesFromPDF(new hummus.PDFRStreamForBuffer(buffer));
  pdfAppend.end();

Could you maybe give me some information how I could extract a file attached to a pdf or copy it to the output file?

Thank you very much in advance for any help or directions!

I attach you here some example files for your reference: The file with a file attachment: test.pdf

An example title sheet caratula.pdf

Best regards! Sebastian

sebastinez commented 5 years ago

Hi Gal and folks, I got some code working. It's surely not the nicest solution, but for now it gets the work done. If anyone is looking for something similar, feel free to copy and edit!

Heavily inspired from the following example: https://github.com/galkahana/HummusJS/blob/master/tests/PDFParser.js

While iterating over the PDF objects, once the code finds a hummus.ePDFObjectStream it checks whether the stream's dictionary has a "Params" entry; if so, it gets the PDFStreamInput object for that stream and reads it into a buffer. For my final use I push the file name (read from the UF entry of the dictionary that describes the attachment) and the obtained file buffer to an array, which gets returned from the function.

/**
 * Extracts the files embedded (attached) in a PDF document.
 *
 * Starting at the document catalog (the trailer's "Root" entry), every
 * reachable object is visited depth-first. A stream whose dictionary has a
 * "Params" entry is treated as an embedded file and its content is read into
 * a Buffer. The name stored with it is taken from the most recently visited
 * dictionary that has an "EF" or "Desc" entry (its "UF" entry, falling back
 * to "F").
 *
 * Relies on a `hummus` binding being in scope.
 *
 * @param {string} path - Path of the PDF file, passed to hummus.createReader.
 * @returns {Array<{buffer: Buffer, name: (string|undefined)}>} One entry per
 *   embedded file found; `name` is undefined when no name could be
 *   determined.
 */
function extractFiles(path) {
  const files = [];
  // IDs of indirect objects already parsed, so shared or cyclic references
  // are followed only once.
  const visitedObjectIDs = new Set();
  // Name from the most recently visited file-specification-like dictionary.
  // Kept local to this call (it used to leak as an implicit global).
  let currentFileName;

  const hasKey = function(object, key) {
    return Object.prototype.hasOwnProperty.call(object, key);
  };

  // Reads the whole content of a PDF stream into a single Buffer.
  function readStreamContent(pdfStream, inReader) {
    const chunks = [];
    const readStream = inReader.startReadingFromStream(pdfStream);
    while (readStream.notEnded()) {
      // read() is expected to return an array of byte values
      // (assumption based on how the original code consumed it).
      chunks.push(Buffer.from(readStream.read(1000)));
    }
    return Buffer.concat(chunks);
  }

  // Recursively visits inObject and everything reachable from it.
  function iterateObjectTypes(inObject, inReader) {
    const objectType = inObject.getType();

    if (objectType === hummus.ePDFObjectDictionary) {
      const dictionary = inObject.toPDFDictionary().toJSObject();

      // Pick up the name BEFORE descending into the children, so it is
      // already known when the embedded stream is reached regardless of the
      // order of the dictionary keys. "Desc" is kept as a trigger for
      // backward compatibility; "EF" covers dictionaries without a
      // description.
      if (hasKey(dictionary, "Desc") || hasKey(dictionary, "EF")) {
        const nameObject = dictionary.UF || dictionary.F;
        // NOTE(review): assumes the name is a direct string object exposing
        // `.value`; an indirect reference here would yield undefined.
        currentFileName = nameObject ? nameObject.value : undefined;
      }

      Object.getOwnPropertyNames(dictionary).forEach(function(key) {
        iterateObjectTypes(dictionary[key], inReader);
      });
    } else if (objectType === hummus.ePDFObjectStream) {
      const pdfStream = inObject.toPDFStream();
      const streamDictionary = pdfStream.getDictionary();

      // A "Params" entry is the marker used here for an embedded file stream.
      if (hasKey(streamDictionary.toJSObject(), "Params")) {
        files.push({
          buffer: readStreamContent(pdfStream, inReader),
          name: currentFileName
        });
      }

      iterateObjectTypes(streamDictionary, inReader);
    } else if (objectType === hummus.ePDFObjectIndirectObjectReference) {
      const objectID = inObject.toPDFIndirectObjectReference().getObjectID();
      if (!visitedObjectIDs.has(objectID)) {
        visitedObjectIDs.add(objectID);
        iterateObjectTypes(inReader.parseNewObject(objectID), inReader);
      }
    } else if (objectType === hummus.ePDFObjectArray) {
      inObject
        .toPDFArray()
        .toJSArray()
        .forEach(function(element) {
          iterateObjectTypes(element, inReader);
        });
    }
    // Any other object type is a leaf and needs no further traversal.
  }

  const pdfReader = hummus.createReader(path);
  const catalog = pdfReader.queryDictionaryObject(pdfReader.getTrailer(), "Root");
  iterateObjectTypes(catalog, pdfReader);
  return files;
}

Best regards Sebastian