galkahana / HummusJS

Node.js module for high performance creation, modification and parsing of PDF files and streams
http://www.pdfhummus.com
Other
1.14k stars 169 forks source link

Extract Text and get Matrix of Word. #375

Open ghost opened 5 years ago

ghost commented 5 years ago

Hi, I'm trying to find a way to get the matrix of a specific word or regex match. I used the text extraction example to get the row (text placement) that contains my word.

I don't know how I could use this to get the matrix of the word itself, so that I can draw the rectangle around just the word instead of the whole row.

var hummus = require('hummus');
var _ = require('lodash');
var extractText = require('./lib/text-extraction');

/**
 * Returns the first text placement whose `text` contains the given substring.
 *
 * @param {string} nameKey - substring to look for inside each placement's text
 * @param {Array<{text: string}>} myArray - placements to scan, in order
 * @returns {object|undefined} the first matching placement, or undefined when
 *          no placement contains nameKey
 */
function search(nameKey, myArray){
    return myArray.find(function (placement) {
        return placement.text.includes(nameKey);
    });
}

/**
 * Demo: finds the first text placement on the first page of
 * './samples/sample.pdf' that contains the search string and writes a modified
 * copy ('./samples/test_out.pdf') with a blue rectangle drawn around that
 * placement.
 *
 * Note that the rectangle covers the whole matching placement (the run of text
 * sharing one matrix), not only the searched word.
 *
 * @throws {Error} when no placement on the first page contains the search string
 */
function runMe() {
    const fileToRun = './samples/sample.pdf';
    const searchText = 'zzzzz';
    const pdfReader = hummus.createReader(fileToRun);

    // extract text for all pages
    // will return an array matching the pages array, where each item is an array of text placements
    // each text placement is represented by an object which has the following structure:
    // {
    //      text: the text
    //      matrix: 6 numbers pdf matrix describing how the text is transformed in relation to the page (this includes position - translation)
    //      localBBox: 4 numbers box describing the text bounding box, before being transformed by matrix.
    //      globalBBox: 4 numbers box describing the text bounding box after transformation, making it the bbox in relation to the page.
    // }
    const pagesPlacements = extractText(pdfReader);

    // uncomment to dump everything that was extracted
    //console.log('pages text placements',JSON.stringify(pagesPlacements,null,2));

    // Only the first page is searched; fall back to an empty list so a missing
    // first-page entry is reported as "not found" instead of crashing in search().
    const firstPagePlacements = pagesPlacements[0] || [];

    // Only keep the first placement whose text includes the search string
    const resultObject = search(searchText, firstPagePlacements);
    console.log(resultObject);

    // Fail before the writer is opened, so a miss does not leave an
    // unfinished writer behind and the error says what actually went wrong.
    if (!resultObject) {
        throw new Error('No text placement containing "' + searchText + '" was found on the first page of ' + fileToRun);
    }

    // create a new version of the file with a rectangle around the matching
    // placement, based on the extraction info
    const pdfWriter = hummus.createWriterToModify(fileToRun,{modifiedFilePath:'./samples/test_out.pdf'});
    const pageModifier = new hummus.PDFPageModifier(pdfWriter,0);

    const cxt = pageModifier.startContext().getContext();

    // localBBox is expressed in the placement's own coordinate space, so the
    // placement matrix is applied first and the box is drawn untransformed.
    const box = resultObject.localBBox;
    const boxWidth = box[2] - box[0];
    const boxHeight = box[3] - box[1];

    cxt.q();
    cxt.cm.apply(cxt,resultObject.matrix);
    cxt.drawRectangle(box[0],box[1],boxWidth,boxHeight,{color:'Blue',width:1});
    cxt.Q();

    pageModifier.endContext().writePage();
    pdfWriter.end();
}

// Script entry point: run the demo as soon as the file is executed.
runMe();
galkahana commented 5 years ago

Nice one. Well, the grouping is per matrix. That means that if a placement contains more text than you need, the "matrix" of your word is the original matrix plus a translation to where the word starts (the width of the text before it), and the word ends after a further translation by the width of the word itself. You could probably tinker with the text extraction code so that it also gives you the individual character boxes as an array, and then you can do the math.

ghost commented 5 years ago

Can you help me modifying the extraction code?

DrogoNevets commented 4 years ago

@Ingokoepp did you ever manage to do this?

I am looking to find specific words within a PDF and then redact them.....

mohammedabualsoud commented 4 years ago

@Ingokoepp did you ever manage to do this?

I am looking to find specific words within a PDF and then redact them.....

I have the same situation. Did you find any solution?

DrogoNevets commented 4 years ago

Alas no, not to date anyway.

It has been de-prioritised for our use case for now (it will come back), but all our research suggested that, basically, it's a bitch to do!