Open carlosrafp opened 2 months ago
Trying to extract the text from the following pdf gives wrong characters.
input pdf: laudo.pdf
Fragment extracted from the 1st page: 䴀䄀吀䔀刀䤀䄀䰀 ⴀ 匀䄀一䜀唀䔀 䠀䔀䴀伀䜀刀䄀䴀䄀
Code used to extract text: async function extractPageTextsFromPdf(pdfBuffer: Buffer): Promise<string[]> { const pdfParser = new PDFParser(null, true, ''); function decodePdfPageTexts(texts: Text[]) { return decodeURIComponent( texts.map((t) =>t.R.map((tt) => tt.T).join(' ')).join(' ') ); } const texts: Promise<string[]> = new Promise((resolve, reject) => { pdfParser.on('pdfParser_dataReady', (pdfData) => { const { Pages: pages } = pdfData; const tx = pages.map((p) => p.Texts).map(decodePdfPageTexts); resolve(tx); }); pdfParser.on('pdfParser_dataError', (errData) => { reject(errData.parserError); }); pdfParser.parseBuffer(pdfBuffer); }); return texts; }
/**
 * Parses an in-memory PDF and resolves with one decoded text string per page.
 *
 * For each page, the `T` value of every run (`R`) of every entry in `Texts`
 * is joined with single spaces, and the joined string is passed through
 * `decodeURIComponent` once.
 *
 * @param pdfBuffer - Raw bytes of the PDF document to parse.
 * @returns A promise resolving to the decoded text of each page, in page order.
 *   It rejects with `errData.parserError` when the parser emits
 *   `pdfParser_dataError`.
 */
async function extractPageTextsFromPdf(pdfBuffer: Buffer): Promise<string[]> {
  // NOTE(review): the constructor arguments are passed through as in the
  // report; their meaning is defined by the PDFParser library — confirm there.
  const parser = new PDFParser(null, true, '');

  // Flattens a page's text items into a single space-separated string and
  // URI-decodes the whole string in one call.
  const decodePdfPageTexts = (texts: Text[]) => {
    const encoded = texts
      .map((item) => item.R.map((run) => run.T).join(' '))
      .join(' ');
    return decodeURIComponent(encoded);
  };

  // Wrap the parser's event-based API in a promise; parsing starts only after
  // both handlers are registered.
  return new Promise<string[]>((resolve, reject) => {
    parser.on('pdfParser_dataReady', (pdfData) => {
      const pageTexts = pdfData.Pages.map((page) => page.Texts);
      resolve(pageTexts.map((texts) => decodePdfPageTexts(texts)));
    });

    parser.on('pdfParser_dataError', (errData) => {
      reject(errData.parserError);
    });

    parser.parseBuffer(pdfBuffer);
  });
}
Trying to extract the text from the following pdf gives wrong characters.
input pdf: laudo.pdf
Fragment extracted from the 1st page: 䴀䄀吀䔀刀䤀䄀䰀 ⴀ 匀䄀一䜀唀䔀 䠀䔀䴀伀䜀刀䄀䴀䄀
Code used to extract text:
/**
 * Parses an in-memory PDF and resolves with one decoded text string per page.
 *
 * For each page, the `T` value of every run (`R`) of every entry in `Texts`
 * is joined with single spaces, and the joined string is passed through
 * `decodeURIComponent` once.
 *
 * @param pdfBuffer - Raw bytes of the PDF document to parse.
 * @returns A promise resolving to the decoded text of each page, in page order.
 *   It rejects with `errData.parserError` when the parser emits
 *   `pdfParser_dataError`.
 */
async function extractPageTextsFromPdf(pdfBuffer: Buffer): Promise<string[]> {
  // NOTE(review): the constructor arguments are passed through as in the
  // report; their meaning is defined by the PDFParser library — confirm there.
  const parser = new PDFParser(null, true, '');

  // Flattens a page's text items into a single space-separated string and
  // URI-decodes the whole string in one call.
  const decodePdfPageTexts = (texts: Text[]) => {
    const encoded = texts
      .map((item) => item.R.map((run) => run.T).join(' '))
      .join(' ');
    return decodeURIComponent(encoded);
  };

  // Wrap the parser's event-based API in a promise; parsing starts only after
  // both handlers are registered.
  return new Promise<string[]>((resolve, reject) => {
    parser.on('pdfParser_dataReady', (pdfData) => {
      const pageTexts = pdfData.Pages.map((page) => page.Texts);
      resolve(pageTexts.map((texts) => decodePdfPageTexts(texts)));
    });

    parser.on('pdfParser_dataError', (errData) => {
      reject(errData.parserError);
    });

    parser.parseBuffer(pdfBuffer);
  });
}