mozilla / pdf.js

PDF Reader in JavaScript
https://mozilla.github.io/pdf.js/
Apache License 2.0
47.23k stars 9.83k forks source link

getTextContent is breaking up text in unexpected ways - very inconsistent #18201

Open rojithaDev opened 1 month ago

rojithaDev commented 1 month ago

Attach (recommended) or Link to PDF file here: input.pdf input2.pdf

Configuration:

Steps to reproduce the problem:

  1. Load a PDF and then a page.
  2. Call `getTextContent` on the page.

import { getDocument } from "pdfjs-dist"; const loadingTask = getDocument(pdfPath); const pdfDocument = await loadingTask.promise; const page = await pdfDocument.getPage(1); const tempTextContent = await page.getTextContent({});


What is the expected behavior? I am unsure what the expectation here should be. What is being used as a delimiter? I would expect it to break at spaces, returns, and (some?) special characters. I would expect that maybe a whole word would be treated as one item in the text content array.

What went wrong? In the first case (input2.pdf) it extracts entire lines without using spaces as a delimiter, and in the other (input.pdf) it breaks at spaces, returns, and also at hyphens.

[ { str: "Jo", dir: "ltr", width: 10.597200000000011, height: 12, transform: [ 12, 0, 0, 12, 72, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "hns", dir: "ltr", width: 19.06320000000001, height: 12, transform: [ 12, 0, 0, 12, 82.593744, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.2029040000000002, height: 0, transform: [ 12, 0, 0, 12, 101.656944, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "Email", dir: "ltr", width: 29.271600000000003, height: 12, transform: [ 12, 0, 0, 12, 104.091792, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.20327800000000215, height: 0, transform: [ 12, 0, 0, 12, 133.36339199999998, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "is", dir: "ltr", width: 8.697600000000005, height: 12, transform: [ 12, 0, 0, 12, 135.802728, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: ": john.doe@example.com", dir: "ltr", width: 133.41959999999992, height: 12, transform: [ 12, 0, 0, 12, 144.49804799999998, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "", dir: "ltr", width: 0, height: 0, transform: [ 12, 0, 0, 12, 72, 694.0799999999999, ], fontName: "g_d1_f1", hasEOL: true, }, { str: "Phone: 123-456-7890", dir: "ltr", width: 110.13479999999996, height: 12, transform: [ 12, 0, 0, 12, 72, 694.0799999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "", dir: "ltr", width: 0, height: 0, transform: [ 12, 0, 0, 12, 72, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: true, }, { str: "SSN: 123-45-6789", dir: "ltr", width: 92.6772, height: 12, transform: [ 12, 0, 0, 12, 72, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "", dir: "ltr", width: 0, height: 0, transform: [ 12, 0, 0, 12, 72, 664.8, ], fontName: "g_d1_f1", hasEOL: true, }, { str: "My credit card number is 4111 1111 1111 1111", dir: "ltr", width: 239.80559999999986, 
height: 12, transform: [ 12, 0, 0, 12, 72, 664.8, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "", dir: "ltr", width: 0, height: 0, transform: [ 12, 0, 0, 12, 72, 650.16, ], fontName: "g_d1_f1", hasEOL: true, }, { str: "John has diabetes and is on medication for it.", dir: "ltr", width: 232.0800000000001, height: 12, transform: [ 12, 0, 0, 12, 72, 650.16, ], fontName: "g_d1_f1", hasEOL: false, }, ]


[ { str: "This", dir: "ltr", width: 21.054000000000016, height: 12, transform: [ 12, 0, 0, 12, 72, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.20301999999999865, height: 0, transform: [ 12, 0, 0, 12, 93.054, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "is", dir: "ltr", width: 8.69759999999999, height: 12, transform: [ 12, 0, 0, 12, 95.49023999999999, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.20293400000000025, height: 0, transform: [ 12, 0, 0, 12, 104.18784, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "an example PDF file.", dir: "ltr", width: 105.53399999999995, height: 12, transform: [ 12, 0, 0, 12, 106.623048, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.20257000000000383, height: 0, transform: [ 12, 0, 0, 12, 212.15704799999995, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "Name:", dir: "ltr", width: 34.83959999999995, height: 12, transform: [ 12, 0, 0, 12, 214.587888, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.2026560000000046, height: 0, transform: [ 12, 0, 0, 12, 249.42748799999993, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "John Doe", dir: "ltr", width: 47.44080000000011, height: 12, transform: [ 12, 0, 0, 12, 251.85935999999998, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: ".", dir: "ltr", width: 3.4320000000000004, height: 12, transform: [ 12, 0, 0, 12, 299.29679999999996, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.20277999999999943, height: 0, transform: [ 12, 0, 0, 12, 302.7288, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "SSN:", dir: "ltr", width: 25.48439999999988, height: 12, transform: [ 12, 0, 0, 12, 305.16216, 708.7199999999999, ], fontName: "g_d1_f1", 
hasEOL: false, }, { str: " ", dir: "ltr", width: 0.20296000000001393, height: 0, transform: [ 12, 0, 0, 12, 330.64655999999985, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "123", dir: "ltr", width: 19.228799999999993, height: 12, transform: [ 12, 0, 0, 12, 333.08208, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "-", dir: "ltr", width: 4.08, height: 12, transform: [ 12, 0, 0, 12, 352.31256, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "45", dir: "ltr", width: 12.818399999999968, height: 12, transform: [ 12, 0, 0, 12, 356.39639999999997, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "-", dir: "ltr", width: 4.08, height: 12, transform: [ 12, 0, 0, 12, 369.21672, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "6789", dir: "ltr", width: 25.63920000000001, height: 12, transform: [ 12, 0, 0, 12, 373.3008, 708.7199999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "", dir: "ltr", width: 0, height: 0, transform: [ 12, 0, 0, 12, 72, 694.0799999999999, ], fontName: "g_d1_f1", hasEOL: true, }, { str: "Email:", dir: "ltr", width: 32.707199999999986, height: 12, transform: [ 12, 0, 0, 12, 72, 694.0799999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.20262200000000044, height: 0, transform: [ 12, 0, 0, 12, 104.70719999999999, 694.0799999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "johndoe@example.com", dir: "ltr", width: 124.12799999999999, height: 12, transform: [ 12, 0, 0, 12, 107.13866399999999, 694.0799999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "", dir: "ltr", width: 0, height: 0, transform: [ 12, 0, 0, 12, 72, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: true, }, { str: "Phone:", dir: "ltr", width: 36.534000000000006, height: 12, transform: [ 12, 0, 0, 12, 72, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.20256999999999792, height: 0, 
transform: [ 12, 0, 0, 12, 108.53400000000002, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "(", dir: "ltr", width: 3.516, height: 12, transform: [ 12, 0, 0, 12, 110.96484, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "555", dir: "ltr", width: 19.228800000000003, height: 12, transform: [ 12, 0, 0, 12, 114.48048, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: ")", dir: "ltr", width: 3.516, height: 12, transform: [ 12, 0, 0, 12, 133.710936, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.2030940000000001, height: 0, transform: [ 12, 0, 0, 12, 137.226936, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "123", dir: "ltr", width: 19.228800000000017, height: 12, transform: [ 12, 0, 0, 12, 139.664064, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "-", dir: "ltr", width: 4.08, height: 12, transform: [ 12, 0, 0, 12, 158.89452, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "4567", dir: "ltr", width: 25.639199999999985, height: 12, transform: [ 12, 0, 0, 12, 162.978504, 679.4399999999999, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "", dir: "ltr", width: 0, height: 0, transform: [ 12, 0, 0, 12, 72, 664.8, ], fontName: "g_d1_f1", hasEOL: true, }, { str: "Address:", dir: "ltr", width: 45.93360000000004, height: 12, transform: [ 12, 0, 0, 12, 72, 664.8, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.204425999999998, height: 0, transform: [ 12, 0, 0, 12, 117.93360000000003, 664.8, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "123", dir: "ltr", width: 19.22879999999999, height: 12, transform: [ 12, 0, 0, 12, 120.386712, 664.8, ], fontName: "g_d1_f1", hasEOL: false, }, { str: " ", dir: "ltr", width: 0.20326399999999722, height: 0, transform: [ 12, 0, 0, 12, 139.61551200000002, 664.8, ], fontName: "g_d1_f1", hasEOL: false, }, { str: "Main St, Anytown, USA", dir: "ltr", 
width: 117.08279999999996, height: 12, transform: [ 12, 0, 0, 12, 142.05468, 664.8, ], fontName: "g_d1_f1", hasEOL: false, }, ]