使用PDF.js从PDF提取文本(2019)

时间:2019-03-18 00:50:43

标签: javascript pdf.js

正如标题所述,我正在尝试使用Mozilla维护的pdf.js从PDF中提取文本。我知道Stack Overflow上以前有过类似的问题,但我仍然不确定该从哪里入手。

我尝试按照这篇文章(article)中的步骤操作,但没有任何反应,整个页面一片空白。

我需要这方面的帮助。

那么,我是否应该把下面的所有代码都放到index.html文件中?

<!DOCTYPE html>
<html lang="en">

<head>
    <title></title>
    <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>

<body>
<h1>PDF.js</h1>

<script src="/path/to/pdf.js"></script>
<script>
    var urlPDF = '/path/to/example.pdf';

    // PDF.js 2.x exposes its API as the global `pdfjsLib`, while 1.x used the
    // global `PDFJS`. Pick whichever the loaded build provides so the snippet
    // works with either generation of the library.
    var pdfjsApi = window.pdfjsLib || window.PDFJS;
    if (!pdfjsApi) {
        // Fail loudly instead of leaving a silently blank page.
        throw new Error('PDF.js failed to load: neither pdfjsLib nor PDFJS is defined');
    }

    // The worker location moved to `GlobalWorkerOptions.workerSrc` in 2.x;
    // 1.x builds read it from `PDFJS.workerSrc`.
    if (pdfjsApi.GlobalWorkerOptions) {
        pdfjsApi.GlobalWorkerOptions.workerSrc = '/path/to/pdf.worker.js';
    } else {
        pdfjsApi.workerSrc = '/path/to/pdf.worker.js';
    }

    // getDocument() returns a loading task. Current builds expose the promise
    // as `.promise`; older loading tasks were themselves thenable, so fall
    // back to the task object when `.promise` is absent.
    var loadingTask = pdfjsApi.getDocument(urlPDF);

    (loadingTask.promise || loadingTask).then(function (pdf) {
        var pagesPromises = [];

        // Page numbers passed to getPageText start at 1. `numPages` is the
        // public page-count property (the old `pdf.pdfInfo` is an internal
        // field that newer builds no longer expose).
        for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            pagesPromises.push(getPageText(pageNumber, pdf));
        }

        // Returning the aggregate promise lets the single catch() below also
        // report failures that happen while extracting page text.
        return Promise.all(pagesPromises);
    }).then(function (pagesText) {
        // Display text of all the pages in the console
        console.log(pagesText);
    }).catch(function (reason) {
        // PDF loading or text extraction error
        console.error(reason);
    });


    /**
     * Retrieves the text of a specific page within a PDF document obtained through pdf.js.
     *
     * The `str` of every text item on the page is concatenated in order, each
     * followed by a single space (so a non-empty result ends with a space).
     *
     * @param {number} pageNum Number of the page, forwarded to `getPage` (the caller passes 1..numPages)
     * @param {PDFDocument} PDFDocumentInstance The PDF document obtained
     * @returns {Promise<string>} Resolves with the page text; rejects if
     *     `getPage` or `getTextContent` fails.
     **/
    function getPageText(pageNum, PDFDocumentInstance) {
        // Return the promise chain directly rather than wrapping it in
        // `new Promise`: a wrapper that never calls reject() would stay
        // pending forever when page loading fails, hanging the caller.
        return PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) {
            // The main trick to obtain the text of the PDF page: getTextContent
            return pdfPage.getTextContent();
        }).then(function (textContent) {
            // Append a space after every item, matching the historical output format
            return textContent.items.map(function (item) {
                return item.str + " ";
            }).join("");
        });
    }
</script>

0 个答案:

没有答案