Open lisenkaci opened 9 months ago
I'm going to answer my own question. I could not figure out how to do it using the library, so I implemented the following function using pdfjsLib:
/**
 * Extracts the text of every page of a PDF as one string.
 *
 * The document is opened with `pdfjsLib.getDocument`, the text content of all
 * pages is requested in parallel, and the per-page strings are concatenated in
 * page order. Text items within a page, and the pages themselves, are joined
 * with no separator.
 *
 * The loading task created here is destroyed once extraction has finished
 * (successfully or not), so repeated calls do not accumulate loaded documents.
 *
 * @param {*} pdfUrl - Passed unchanged to `pdfjsLib.getDocument`.
 * @returns {Promise<string>} Resolves with the concatenated text of all pages;
 *   rejects with the original error if loading the document or reading any
 *   page fails.
 */
function extractText(pdfUrl) {
  const loadingTask = pdfjsLib.getDocument(pdfUrl);

  // Wrapped in Promise.resolve so cleanup can always be chained on.
  const releaseDocument = () => Promise.resolve(loadingTask.destroy());

  // Requests every page's text up front so the pages are processed in parallel.
  const collectPageTexts = (pdf) => {
    const pageTextPromises = [];
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      pageTextPromises.push(
        pdf
          .getPage(pageNumber)
          .then((page) => page.getTextContent())
          .then((textContent) =>
            textContent.items.map((item) => item.str).join("")
          )
      );
    }
    // Promise.all keeps the results in page order regardless of completion order.
    return Promise.all(pageTextPromises);
  };

  return loadingTask.promise.then(collectPageTexts).then(
    (pageTexts) => releaseDocument().then(() => pageTexts.join("")),
    (error) => {
      // Surface the extraction error even if the cleanup itself fails.
      const rethrow = () => {
        throw error;
      };
      return releaseDocument().then(rethrow, rethrow);
    }
  );
}
I then called it in the onDocumentLoad callback. This worked, and I am now getting all the text content from the PDF.
I need to extract all the text content from a PDF as soon as it's loaded. I can't find the text value in the onDocumentLoad props, and using renderPage with renderPageProps.textLayerRendered only gives the text content for the page that is currently being scrolled. I need ALL the text found in the PDF as soon as it is available. Thank you.