从PDF角度提取文本

时间:2018-09-18 10:30:56

标签: javascript angularjs node.js pdf

是否可以通过角度提取PDF中的所有文本?

我在寻找任何解决方案,如何从PDF中提取文本,但似乎没人能正常工作。

我尝试过这个:

// If absolute URL from the remote server is provided, configure the CORS
// header on that server.
var url = '//cdn.mozilla.net/pdfjs/tracemonkey.pdf';

// The workerSrc property shall be specified.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

PDFJS.getDocument(url).then(function (pdf) {
    var pdfDocument = pdf;
    var pagesPromises = [];

    for (var i = 0; i < pdf.pdfInfo.numPages; i++) {
        // Required to prevent that i is always the total of pages
        (function (pageNumber) {
            pagesPromises.push(getPageText(pageNumber, pdfDocument));
        })(i + 1);
    }

    Promise.all(pagesPromises).then(function (pagesText) {
    		// Remove loading
        $("#loading-info").remove();
        
        // Render text
        for(var i = 0;i < pagesText.length;i++){
        	$("#pdf-text").append("<div><h3>Page "+ (i + 1) +"</h3><p>"+pagesText[i]+"</p><br></div>")
        }
    });

}, function (reason) {
    // PDF loading error
    console.error(reason);
});


/**
 * Retrieves the text of a specif page within a PDF Document obtained through pdf.js 
 * 
 * @param {Integer} pageNum Specifies the number of the page 
 * @param {PDFDocument} PDFDocumentInstance The PDF document obtained 
 **/
function getPageText(pageNum, PDFDocumentInstance) {
    // Return a Promise that is solved once the text of the page is retrieven
    return new Promise(function (resolve, reject) {
        PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) {
            // The main trick to obtain the text of the PDF page, use the getTextContent method
            pdfPage.getTextContent().then(function (textContent) {
                var textItems = textContent.items;
                var finalString = "";

                // Concatenate the string of the item to the final string
                for (var i = 0; i < textItems.length; i++) {
                    var item = textItems[i];

                    finalString += item.str + " ";
                }

                // Solve promise with the text retrieven from the page
                resolve(finalString);
            });
        });
    });
}
<script src="//mozilla.github.io/pdf.js/build/pdf.js"></script>

<h1>PDF.js Extract PDF Text</h1>

<a href="http://ourcodeworld.com/articles/read/405/how-to-convert-pdf-to-text-extract-text-from-pdf-with-javascript">Read article here</a>
 
<div id="pdf-text">
  <div id="loading-info">
    Extracting text ... hold tight !
  </div>
</div>

但是运行后,没有从PDF中提取文本。您还有其他想法如何从PDF提取文本吗?

我也在NPM pdf-text-extract中尝试,但是我不知道如何从npm导入模块到角

0 个答案:

没有答案