在Javascript中从pdf中提取文本

时间:2009-10-12 12:26:28

标签: javascript pdf text

我想知道是否可以通过仅使用Javascript来获取PDF文件中的文本? 如果是的话,有人能告诉我怎么做?

我知道有一些服务器端java,c#等库,但我不想使用服务器。 感谢

7 个答案:

答案 0 :(得分:58)

这是一个古老的问题,但由于pdf.js多年来一直在发展,我想给出一个新的答案。也就是说,它可以在本地完成,而不涉及任何服务器或外部服务。新的pdf.js有一个函数:page.getTextContent()。您可以从中获取文本内容。我用以下代码成功完成了它。

  1. 您在每一步中得到的都是承诺。您需要以这种方式编码:.then( function(){...})以继续下一步。

    1)PDFJS.getDocument( data ).then( function(pdf) {

    2)pdf.getPage(i).then( function(page){

    3)page.getTextContent().then( function(textContent){

  2. 你最终得到的是一个字符串数组textContent.bidiTexts[]。您将它们连接起来以获取1页的文本。文本块的坐标用于判断是否需要插入换行符或空格。 (这可能不是很完整,但从我的测试来看似乎没问题。)

  3. 输入参数data必须是URL或ArrayBuffer类型的数据。我使用FileReader API中的readAsArrayBuffer(file)函数来获取数据。

  4. 希望这有帮助。

    注意:根据其他一些用户,该库已更新并导致代码中断。根据下面 async5 的评论,您需要将textContent.bidiTexts替换为textContent.items

        /**
         * Extracts the plain text of a PDF with pdf.js (expects the global `PDFJS`).
         *
         * Create it with `new Pdf2TextClass()`. The `complete` property holds the
         * number of pages processed by the most recent `pdfToText` call.
         */
        function Pdf2TextClass(){
            var self = this;
            this.complete = 0;

            // Matches a lone letter, or text ending in "<whitespace><letter>";
            // no space is inserted after such a block.
            var TRAILING_LETTER = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

            // Reads one coordinate of a text block. Legacy `bidiTexts` entries expose
            // `x`/`y` directly; `items` entries keep the horizontal and vertical
            // offsets in `transform[4]` and `transform[5]`.
            function coordinate(block, name, transformIndex){
                if (typeof block[name] === 'number') {
                    return block[name];
                }
                return block.transform ? block.transform[transformIndex] : undefined;
            }

            // Concatenates the blocks of one page. A line break is inserted when the
            // horizontal position moves backwards, a space when only the vertical
            // position changes; nothing is added after a block ending in a space.
            function joinBlocks(blocks){
                var pageText = "";
                var last = null;
                for (var k = 0; k < blocks.length; k++){
                    var block = blocks[k];
                    if (last !== null && last.str[last.str.length - 1] !== ' '){
                        if (coordinate(block, 'x', 4) < coordinate(last, 'x', 4)) {
                            pageText += "\r\n";
                        } else if (coordinate(last, 'y', 5) !== coordinate(block, 'y', 5) &&
                                   last.str.match(TRAILING_LETTER) === null) {
                            pageText += ' ';
                        }
                    }
                    pageText += block.str;
                    last = block;
                }
                return pageText;
            }

            /**
             *
             * @param data ArrayBuffer of the pdf file content, or a URL string
             * @param callbackPageDone Optional. Informs about progress each time
             *        a page is finished. Its input parameters are:
             *        1) number of pages done;
             *        2) total number of pages in file.
             * @param callbackAllDone Optional. Receives the text extracted from
             *        the whole pdf file, pages in ascending order.
             * @returns Promise resolving to the same full text; it rejects when
             *          pdf.js fails or a callback throws.
             *
             */
            this.pdfToText = function(data, callbackPageDone, callbackAllDone){
                console.assert( data instanceof ArrayBuffer || typeof data == 'string' );

                // Every run starts counting from zero so the object can be reused.
                self.complete = 0;
                var finished = 0;

                // getDocument() either returns a promise itself or a loading task
                // that carries the promise in `.promise`; accept both.
                var task = PDFJS.getDocument( data );
                var ready = task.promise || task;

                return ready.then( function(pdf) {
                    var total = pdf.numPages;
                    if (typeof callbackPageDone === 'function') {
                        callbackPageDone( 0, total );
                    }

                    // One promise per page, kept in page order: pages may finish in
                    // any order, Promise.all still yields the texts in sequence.
                    var pending = [];
                    for (var i = 1; i <= total; i++){
                        pending.push( pdf.getPage(i).then( function(page){
                            var n = page.pageNumber;
                            return page.getTextContent().then( function(textContent){
                                var blocks = textContent.items || textContent.bidiTexts;
                                var text = "";
                                if (blocks != null){
                                    text = joinBlocks(blocks) + "\n\n";
                                    console.log("page " + n + " finished.");
                                }
                                finished += 1;
                                self.complete = finished;
                                if (typeof callbackPageDone === 'function') {
                                    callbackPageDone( finished, total );
                                }
                                return text;
                            });
                        }));
                    }

                    return Promise.all(pending).then( function(texts){
                        var fullText = texts.join("");
                        if (typeof callbackAllDone === 'function') {
                            callbackAllDone(fullText);
                        }
                        return fullText;
                    });
                });
            }; // end of pdfToText()
        } // end of class
    

答案 1 :(得分:9)

我无法让gm2008的示例工作(pdf.js的内部数据结构显然已经改变),所以我编写了自己的、完全基于Promise的解决方案。它不使用任何DOM元素、queryselector或canvas,而是使用来自Mozilla示例的更新版pdf.js。

它使用了一个文件路径进行上传,因为我将它与node-webkit一起使用。 你需要确保你已经下载并指向某个地方的cmaps,你需要pdf.js和pdf.worker.js来实现这个目的。

    /**
     * Reads every page of a PDF through PDF.js and returns its text.
     * Built against the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
     *
     * @param data Document source, handed unchanged to `PDFJS.getDocument`.
     * @returns Promise resolving to a single string: the text items of each
     *          page joined with one space, the pages joined with "\r\n".
     */
    this.pdfToText = function(data) {

        // Worker script and character-map location used by PDF.js.
        PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
        PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
        PDFJS.cMapPacked = true;

        // Resolves to the text of one page; page numbers are 1-based.
        function readPage(pdf, pageNumber) {
            return pdf.getPage(pageNumber)
                .then(function(page) {
                    return page.getTextContent();
                })
                .then(function(textContent) {
                    var strings = [];
                    textContent.items.forEach(function(item) {
                        strings.push(item.str);
                    });
                    return strings.join(' ');
                });
        }

        return PDFJS.getDocument(data).then(function(pdf) {
            var pending = [];
            for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
                pending.push(readPage(pdf, pageNumber));
            }
            // Promise.all keeps the page order regardless of completion order.
            return Promise.all(pending).then(function(texts) {
                return texts.join("\r\n");
            });
        });
    }

用法:

 // Convert the first file's path and log the extracted text once it is ready.
 self.pdfToText(files[0].path).then((result) => {
      console.log("PDF done!", result);
 })

答案 2 :(得分:7)

以下是一些使用http://hublog.hubmed.org/archives/001948.html中的Pdf.js执行您想要的JavaScript代码:

// Page elements: the PDF source, the processing frame, and the node that
// receives the extracted text.
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");

// React to messages posted by the processor frame.
window.addEventListener("message", function(message){
  // Ignore anything that does not come from the processor frame.
  if (message.source != processor.contentWindow) return;

  if (message.data === "ready") {
    // The processor is ready: download the PDF as raw bytes and pass it on.
    var request = new XMLHttpRequest();
    request.open('GET', input.getAttribute("src"), true);
    request.responseType = "arraybuffer";
    request.onload = function() {
      processor.contentWindow.postMessage(request.response, "*");
    };
    request.send();
    return;
  }

  // Any other message is the text of the PDF; collapse whitespace runs.
  output.textContent = message.data.replace(/\s+/g, " ");
}, true);

......这是一个例子:

http://git.macropus.org/2011/11/pdftotext/example/

答案 3 :(得分:2)

注意:此代码假定您正在使用nodejs。这意味着您要解析的是本地文件而不是网页中的文件,因为原始问题并没有明确询问在网页上解析pdf的问题。

@ gm2008的答案是一个很好的起点(请阅读它及其注释以获取更多信息),但是需要一些更新(08/19)并且有一些未使用的代码。我也喜欢更完整的示例。可以进行更多的重构和调整(例如,使用await),但是目前它已经尽可能接近原始答案了。

和以前一样,它使用Mozilla的PDFjs库。 npmjs软件包位于https://www.npmjs.com/package/pdfjs-dist

以我的经验,这段代码在判断应该在哪里插入空格方面效果并不理想,但那是另一个问题了。

[编辑:我相信对.transform使用的更新已恢复了空白,恢复了原来的状态。]

// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');

// Location of the PDF to convert.
const pathToPDF = 'path/to/myPDFfileToText.pdf';

// Nothing happens between pages; the finished text is printed.
const onPageDone = () => {};
const onFinish = (fullText) => { console.log(fullText); };

const toText = Pdf2TextObj();
toText.pdfToText(pathToPDF, onPageDone, onFinish);

/**
 * Builds a PDF-to-text converter on top of PDF.js (`PDFJS` must be in scope).
 *
 * Works as a plain call, `Pdf2TextObj()`, and also with `new`.
 *
 * @returns An object with:
 *          - `complete`: number of pages processed by the latest run;
 *          - `pdfToText(path, callbackPageDone, callbackAllDone)`.
 */
function Pdf2TextObj() {
    // A dedicated object instead of `this`: in a plain call `this` is
    // undefined in strict/module code and the global object otherwise.
    const self = { complete: 0 };

    // Matches a lone letter, or text ending in "<whitespace><letter>";
    // no space is inserted after such an item.
    const TRAILING_LETTER = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

    // Concatenates the text items of one page. A line break is inserted when
    // `transform[5]` decreases from one item to the next, a space when only
    // `transform[4]` changes; nothing is added after an item ending in a space.
    function joinItems(items) {
        let pageText = "";
        let lastItem = null;
        for (const item of items) {
            if (lastItem != null && lastItem.str[lastItem.str.length - 1] != ' ') {
                if (item.transform[5] < lastItem.transform[5]) {
                    pageText += "\r\n";
                } else if (item.transform[4] != lastItem.transform[4] &&
                           lastItem.str.match(TRAILING_LETTER) == null) {
                    pageText += ' ';
                }
            }
            pageText += item.str;
            lastItem = item;
        }
        return pageText;
    }

    /**
     *
     * @param path Path to the pdf file.
     * @param callbackPageDone Optional. Informs about progress each time
     *        a page is finished. Its input parameters are:
     *        1) number of pages done.
     *        2) total number of pages in file.
     *        3) the `page` object itself or null.
     * @param callbackAllDone Optional. Called after all text has been collected.
     *        Input parameters:
     *        1) full text of parsed pdf, pages in ascending order.
     * @returns Promise resolving to the same full text; it rejects when
     *          PDF.js fails or a callback throws.
     *
     */
    self.pdfToText = function(path, callbackPageDone, callbackAllDone) {
        // Every run starts counting from zero so the object can be reused.
        self.complete = 0;
        let finished = 0;

        return PDFJS.getDocument(path).promise.then(function(pdf) {
            const total = pdf.numPages;
            if (typeof callbackPageDone === 'function') {
                callbackPageDone(0, total, null);
            }

            // Pages do not necessarily finish in consecutive order. Keeping
            // one promise per page, in page order, lets Promise.all return
            // the texts in sequence once every page is really done.
            const pending = [];
            for (let pagei = 1; pagei <= total; pagei++) {
                pending.push(pdf.getPage(pagei).then(function(page) {
                    return page.getTextContent().then(function(textContent) {
                        let text = "";
                        if (textContent.items != null) {
                            text = joinItems(textContent.items) + "\n\n";
                            console.log("page " + page.pageNumber + " finished.");
                        }

                        finished += 1;
                        self.complete = finished;
                        if (typeof callbackPageDone === 'function') {
                            callbackPageDone(finished, total, page);
                        }
                        return text;
                    });
                }));
            }

            return Promise.all(pending).then(function(texts) {
                const fullText = texts.join("");
                if (typeof callbackAllDone === 'function') {
                    callbackAllDone(fullText);
                }
                return fullText;
            });
        });
    }; // Ends pdfToText()

    return self;
} // Ends object factory

在终端中运行:

node myPDFfileToText.js

答案 4 :(得分:1)

2021 年 2 月更新

<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
    <script>
    
/**
 * Extracts the plain text of a PDF with pdf.js (expects the global `pdfjsLib`).
 *
 * Create it with `new Pdf2TextClass()`. The `complete` property holds the
 * number of pages processed by the most recent `pdfToText` call.
 */
function Pdf2TextClass(){
    var self = this;
    this.complete = 0;

    // Matches a lone letter, or text ending in "<whitespace><letter>";
    // no space is inserted after such an item.
    var TRAILING_LETTER = /^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/;

    // Concatenates the text items of one page. Items carry their position in
    // `transform` (index 4: horizontal offset, index 5: vertical offset).
    // A line break is inserted when the horizontal position moves backwards,
    // a space when only the vertical position changes; nothing is added after
    // an item ending in a space.
    function joinItems(items){
        var pageText = "";
        var last = null;
        for (var k = 0; k < items.length; k++){
            var item = items[k];
            if (last !== null && last.str[last.str.length - 1] !== ' '){
                if (item.transform[4] < last.transform[4]) {
                    pageText += "\r\n";
                } else if (last.transform[5] !== item.transform[5] &&
                           last.str.match(TRAILING_LETTER) === null) {
                    pageText += ' ';
                }
            }
            pageText += item.str;
            last = item;
        }
        return pageText;
    }

    /**
     * @param data ArrayBuffer of the pdf file content, or a URL string.
     * @param callbackPageDone Optional. Called with (pages done, total pages)
     *        once before the first page and again after every page.
     * @param callbackAllDone Optional. Receives the full text, pages in
     *        ascending order. Without it the full text is written to the console.
     * @returns Promise resolving to the same full text; it rejects when
     *          pdf.js fails or a callback throws.
     */
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer || typeof data == 'string' );

        // Every run starts counting from zero so the object can be reused.
        self.complete = 0;
        var finished = 0;

        var loadingTask = pdfjsLib.getDocument(data);
        return loadingTask.promise.then(function(pdf) {
            var total = pdf.numPages;
            if (typeof callbackPageDone === 'function') {
                callbackPageDone( 0, total );
            }

            // One promise per page, kept in page order: pages may finish in
            // any order, Promise.all still yields the texts in sequence.
            var pending = [];
            for (var i = 1; i <= total; i++){
                pending.push( pdf.getPage(i).then( function(page){
                    var n = page.pageNumber;
                    return page.getTextContent().then( function(textContent){
                        var text = "";
                        if (textContent.items != null){
                            text = joinItems(textContent.items) + "\n\n";
                            console.log("page " + n + " finished.");
                        }
                        finished += 1;
                        self.complete = finished;
                        if (typeof callbackPageDone === 'function') {
                            callbackPageDone( finished, total );
                        }
                        return text;
                    });
                }));
            }

            return Promise.all(pending).then( function(texts){
                var fullText = texts.join("");
                if (typeof callbackAllDone === 'function') {
                    callbackAllDone(fullText);
                } else {
                    console.log(fullText);
                }
                return fullText;
            });
        });
    }; // end of pdfToText()
} // end of class
// Create a converter and start the extraction; no callbacks are passed here.
var pdff = new Pdf2TextClass();
// 'PDF_URL' is presumably a placeholder for the address of the PDF to read.
pdff.pdfToText('PDF_URL');
    </script>

答案 5 :(得分:0)

对于实际想要在节点服务器上使用它的所有人:

/**
 * Created by velten on 25.04.16.
 */
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
// pdf2json exports the parser class; a stream has to be piped into an
// instance of it, not into the class itself.
let PDFParser = require('pdf2json');
var pdfParser = new PDFParser();

// pipe() returns its destination, so `pdfPipe` is the parser instance and
// emits the parser events handled below. `encoding: null` keeps the
// downloaded body binary.
let pdfPipe = request({url: pdfUrl, encoding: null}).pipe(pdfParser);

pdfPipe.on("pdfParser_dataError", err => console.error(err));
pdfPipe.on("pdfParser_dataReady", pdf => {
    //optionally:
    //pdf = pdfParser.getMergedTextBlocksIfNeeded();

    // NOTE(review): older pdf2json releases wrap the pages in `formImage`;
    // newer ones are believed to expose `Pages` at the top level - confirm
    // against the installed version.
    let pages = (pdf.formImage || pdf).Pages;

    // Count the text blocks across all pages.
    let count1 = 0;
    for (let page of pages) {
        count1 += page.Texts.length;
    }

    console.log(count1);
    pdfParser.destroy();
});

答案 6 :(得分:-2)

有可能但是:

  • 无论如何你都必须使用服务器,你无法在用户计算机上获取文件内容而无需将其转移到服务器并返回
  • 我还没见过有人写过这样的库

所以如果你有空闲时间,可以自己学习pdf格式并编写这样的库;当然,你也可以使用服务器端的库。