正如标题所述,我正在尝试使用Mozilla维护的pdf.js从PDF提取文本。我知道以前关于stackoverflow的问题,但是我不确定从哪里开始。
我尝试遵循此article,但没有任何反应。我的整个页面空白。
我需要这方面的帮助。
所以我将下面的所有代码放到index.html文件中了吗?
<!DOCTYPE html>
<html lang="en">
<head>
<title></title>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<h1>PDF.js</h1>
<script src="/path/to/pdf.js"></script>
<script>
var urlPDF = '/path/to/example.pdf';
PDFJS.workerSrc = '/path/to/pdf.worker.js';
PDFJS.getDocument(urlPDF).then(function (pdf) {
var pdfDocument = pdf;
var pagesPromises = [];
for (var i = 0; i < pdf.pdfInfo.numPages; i++) {
// Required to prevent that i is always the total of pages
(function (pageNumber) {
pagesPromises.push(getPageText(pageNumber, pdfDocument));
})(i + 1);
}
Promise.all(pagesPromises).then(function (pagesText) {
// Display text of all the pages in the console
console.log(pagesText);
});
}, function (reason) {
// PDF loading error
console.error(reason);
});
/**
* Retrieves the text of a specif page within a PDF Document obtained through pdf.js
*
* @param {Integer} pageNum Specifies the number of the page
* @param {PDFDocument} PDFDocumentInstance The PDF document obtained
**/
function getPageText(pageNum, PDFDocumentInstance) {
// Return a Promise that is solved once the text of the page is retrieven
return new Promise(function (resolve, reject) {
PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) {
// The main trick to obtain the text of the PDF page, use the getTextContent method
pdfPage.getTextContent().then(function (textContent) {
var textItems = textContent.items;
var finalString = "";
// Concatenate the string of the item to the final string
for (var i = 0; i < textItems.length; i++) {
var item = textItems[i];
finalString += item.str + " ";
}
// Solve promise with the text retrieven from the page
resolve(finalString);
});
});
});
}
</script>