是否可以通过角度提取PDF中的所有文本?
我在寻找任何解决方案,如何从PDF中提取文本,但似乎没人能正常工作。
我尝试过这个:
// If absolute URL from the remote server is provided, configure the CORS
// header on that server.
var url = '//cdn.mozilla.net/pdfjs/tracemonkey.pdf';
// The workerSrc property shall be specified.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';
PDFJS.getDocument(url).then(function (pdf) {
var pdfDocument = pdf;
var pagesPromises = [];
for (var i = 0; i < pdf.pdfInfo.numPages; i++) {
// Required to prevent that i is always the total of pages
(function (pageNumber) {
pagesPromises.push(getPageText(pageNumber, pdfDocument));
})(i + 1);
}
Promise.all(pagesPromises).then(function (pagesText) {
// Remove loading
$("#loading-info").remove();
// Render text
for(var i = 0;i < pagesText.length;i++){
$("#pdf-text").append("<div><h3>Page "+ (i + 1) +"</h3><p>"+pagesText[i]+"</p><br></div>")
}
});
}, function (reason) {
// PDF loading error
console.error(reason);
});
/**
* Retrieves the text of a specif page within a PDF Document obtained through pdf.js
*
* @param {Integer} pageNum Specifies the number of the page
* @param {PDFDocument} PDFDocumentInstance The PDF document obtained
**/
function getPageText(pageNum, PDFDocumentInstance) {
// Return a Promise that is solved once the text of the page is retrieven
return new Promise(function (resolve, reject) {
PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) {
// The main trick to obtain the text of the PDF page, use the getTextContent method
pdfPage.getTextContent().then(function (textContent) {
var textItems = textContent.items;
var finalString = "";
// Concatenate the string of the item to the final string
for (var i = 0; i < textItems.length; i++) {
var item = textItems[i];
finalString += item.str + " ";
}
// Solve promise with the text retrieven from the page
resolve(finalString);
});
});
});
}
<script src="//mozilla.github.io/pdf.js/build/pdf.js"></script>
<h1>PDF.js Extract PDF Text</h1>
<a href="http://ourcodeworld.com/articles/read/405/how-to-convert-pdf-to-text-extract-text-from-pdf-with-javascript">Read article here</a>
<div id="pdf-text">
<div id="loading-info">
Extracting text ... hold tight !
</div>
</div>
但是运行后,没有从PDF中提取文本。您还有其他想法如何从PDF提取文本吗?
我也在NPM pdf-text-extract中尝试,但是我不知道如何从npm导入模块到角