modesty / pdf2json

converts binary PDF to JSON and text, for server-side PDF processing and command-line use.
https://github.com/modesty/pdf2json
Other
2.02k stars 377 forks source link

getRawTextContent() method not consistent in multiple sequential calls #151

Open Fazila-A opened 6 years ago

Fazila-A commented 6 years ago

This code parses two pdf files and converts to rawtext and removes the line starting with 'Generated' and then compares those two text files.This method is called more than once sequentially and for each call the arguments to the method changes, but the raw content is not getting replaced for the last call.


function compareGeneratedReportContent(samplePDFFile, sampleXLSXText) {
    const pdfParser = new PDFParser(this, true)
    const pdfParser2 = new PDFParser(this, true)
    const pdfExportPath = path.join(__dirname, '../../resources/report-test/PDFExportedFile.pdf')
    const xlsxExportPath = path.join(__dirname, '../../resources/report-test/XLSXExportedFile.xlsx')
    let content1 = ''
    let content2 = ''
    let result = false

    pdfParser2.on('pdfParser_dataError', errData => console.log(errData))
    pdfParser2.on('pdfParser_dataReady', () => {
        content2 = pdfParser2.getRawTextContent().replace(/^.*(Generated).+$/mg, '')
        // console.log('content2 ', content2)
        fs.writeFileSync(path.join(__dirname, '../../resources/report-test/sample.txt'), content2, 'utf-8')
    })
    pdfParser2.loadPDF(path.join(__dirname, `../../resources/report-test/${samplePDFFile}.pdf`))

    pdfParser.on('pdfParser_dataError', errData => console.log(errData))
    pdfParser.on('pdfParser_dataReady', () => {
        content1 = pdfParser.getRawTextContent().replace(/^.*(Generated).+$/mg, '')
        // console.log('content1 ', content1)
        fs.writeFileSync(path.join(__dirname, '../../resources/report-test/generated.txt'), content2, 'utf-8')
    })
    pdfParser.loadPDF(pdfExportPath)

    let readContent2 = fs.readFileSync(path.join(__dirname, '../../resources/report-test/sample.txt'), 'utf-8')
    let readContent1 = fs.readFileSync(path.join(__dirname, '../../resources/report-test/generated.txt'), 'utf-8')
    if (readContent2 === readContent1) {
        console.log('Report pdf file content matches')
        result = true
    } else {
        console.log('Error in matching contents of report pdf')
        result = false
    }
}

```Can anybody help?
Regards,
Fazi
NatanB4 commented 1 year ago

already solved?