是否有任何目前仍然可用的方法,可以通过 JavaScript 将 PDF / DocX 转换为文本?

时间:2017-03-14 07:09:35

标签: javascript docx pdf-reader pdftotext

/**
 * Wires up the resume file input once the page has loaded.
 *
 * When the user picks a file in #myResume, the handler dispatches on the
 * file's MIME type:
 *   - text/*              -> read as text and show it in #displayResume
 *   - image/*             -> run OCRAD on the image and alert the recognised text
 *   - application/pdf     -> hand the file (as a data URL image) to Tesseract
 *   - application/msword  -> clear the display (no text extraction is done)
 *   - anything else       -> show an "unrecognised media type" message
 */
window.onload = function() {
    var myResume = document.getElementById('myResume');
    var displayResume = document.getElementById('displayResume');

    myResume.addEventListener('change', function() {
        var resume = myResume.files[0];
        var textType = /text.*/;
        var imageType = /image.*/;
        var reader;

        // "change" can fire with an empty FileList (e.g. the user clears a
        // previous selection by cancelling the dialog); without this guard
        // `resume.type` throws a TypeError.
        if (!resume) {
            return;
        }

        if (resume.type.match(textType)) {
            reader = new FileReader();
            reader.onload = function() {
                displayResume.innerText = reader.result;
            };
            reader.readAsText(resume);
        }
        else if (resume.type.match(imageType)) {
            reader = new FileReader();
            reader.onload = function() {
                displayResume.innerHTML = "";
                var img = new Image();
                // Assigning `src` only starts the load; the pixels are not
                // available until `onload` fires, so run OCR from there.
                img.onload = function() {
                    var string = OCRAD(img);
                    alert(string);
                };
                img.src = reader.result;
            };
            reader.readAsDataURL(resume);
        }
        else if (resume.type === 'application/pdf') {
            reader = new FileReader();
            reader.onload = function() {
                displayResume.innerHTML = "";
                var img = new Image();
                // NOTE(review): a PDF data URL is not an image format that
                // <img> can decode, so this OCR call is not expected to
                // produce text. The PDF would have to be rendered to a canvas
                // or parsed by a PDF library first -- confirm and replace.
                img.src = reader.result;
                Tesseract.recognize(img)
                    .progress(function (p) { console.log('progress', p); })
                    .then(function (result) { console.log('result', result); })
                    // Surface recognition failures instead of dropping them.
                    .catch(function (err) { console.error('recognition failed', err); });
            };
            reader.readAsDataURL(resume);
        }
        else if (resume.type === 'application/msword') {
            // NOTE(review): this branch only clears the display; the Word
            // document is loaded into an Image that is never used, so no text
            // is extracted. It matches legacy .doc only -- .docx files report
            // a different MIME type and fall through to the final branch.
            reader = new FileReader();
            reader.onload = function() {
                displayResume.innerHTML = "";
                var img = new Image();
                img.src = reader.result;
            };
            reader.readAsDataURL(resume);
        }
        else {
            displayResume.innerText = "Media type couldn't recognized.";
        }
    });
};

/**
 * Submit button handler: alerts the user when neither the resume input
 * (#myResume) nor the cover-letter input (#myCover) has a file selected.
 */
$("#submitBTN").click(function() {
    // Look the inputs up explicitly. No variable named myResume or myCover is
    // in scope here, so the bare names only resolved through the browser's
    // implicit window.<element id> globals.
    var resumeInput = document.getElementById('myResume');
    var coverInput = document.getElementById('myCover');

    if (resumeInput.files.length === 0 && coverInput.files.length === 0) {
        alert("No files uploaded.");
    }
});
/* Horizontally centred container, pushed 250px down from the top. */
.centralize {
    padding-top: 250px;
    margin-right: auto;
    margin-left: auto;
}

/* Output area for the extracted text; scrolls sideways on overflow. */
#displayResume {
    width: 100%;
    margin-top: 2em;
    overflow-x: auto;
}

/* Full-width, centred, white title; !important overrides other rules. */
.header-title {
    width: 100%;
    text-align: center;
    color: #FFF !important;
    float: none !important;
}

/* Full-width header with centred content and 25px vertical padding. */
.navbar-header {
    width: 100%;
    padding-top: 25px;
    padding-bottom: 25px;
    text-align: center;
}
<!doctype html>
<html ng-app>
<head>
    <meta charset="UTF-8">
    <!-- X-UA-Compatible takes a document-mode value; "text/html" is not one. -->
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="description" content="">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0">
    <meta name="robots" content="all,follow">
    <title>Arete Human Resources</title>

    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" crossorigin="anonymous">
    <link href="css/ionic.min.css" rel="stylesheet"/>
    <link href="css/style.css" rel="stylesheet">
    <!-- NOTE(review): Angular is loaded here and the Ionic scripts at the end
         of <body> presumably bundle Angular as well; confirm it is not being
         loaded twice. -->
    <script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.2.23/angular.min.js"></script>
</head>
<body>
    <nav class="navbar navbar-inverse navbar-fixed-top">
      <div class="container">
        <div class="navbar-header">
          <a class="navbar-brand header-title">Arete Human Resources</a>
        </div>
      </div>
    </nav>
    
    <div class="container centralize">
        <div class="row">
            <div class="col-xs-5 col-sm-5 col-md-5 col-lg-5">
                <input type="file" id="myResume" accept=".txt,.doc,.docx,.pdf,.jpg">
            </div>
            <div class="col-xs-4 col-sm-4 col-md-4 col-lg-4">
                <input type="text" ng-model="fileName" placeholder="Your name here">
            </div>
            <div class="col-xs-3 col-sm-3 col-md-3 col-lg-3">
                <h4>{{fileName}}</h4>
            </div>
        </div>
        <div class="row">
            <div class="col-xs-5 col-sm-5 col-md-5 col-lg-5">
                <input type="file" id="myCover" accept=".txt,.doc,.docx,.pdf,.jpg">
            </div>
            <div class="col-xs-4 col-sm-4 col-md-4 col-lg-4">
                <input type="text" ng-model="fileDesc" placeholder="Explanations">
            </div>
            <div class="col-xs-3 col-sm-3 col-md-3 col-lg-3">
                <h5>{{fileDesc}}</h5>
            </div>
        </div>
        <hr>
        <button type="submit" class="button button-outline button-positive" id="submitBTN" style="float:right">Submit</button>
        <div id="displayResume"></div>
    </div>

    <!-- Pinned, HTTPS jQuery: the "jquery-latest" alias is deprecated and
         frozen at 1.11.1, and a plain-http script is blocked as mixed content
         when the page itself is served over HTTPS. -->
    <script src="https://code.jquery.com/jquery-1.11.1.min.js"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" crossorigin="anonymous"></script>
    <script src="js/ionic.min.js" type="text/javascript"></script>
    <script src="js/ionic.bundle.min.js" type="text/javascript"></script>
    <script src="js/ionic-angular.min.js" type="text/javascript"></script>
    <script src="js/ocrad.js" type="text/javascript"></script>
<!--<script src="js/require.js" type="text/javascript"></script>-->
    <!-- NOTE(review): RawGit has been retired as a CDN; verify this URL still
         resolves or host tesseract.js locally. -->
    <script src='https://cdn.rawgit.com/naptha/tesseract.js/1.0.10/dist/tesseract.js'></script>
    <script src="js/main.js" type="text/javascript"></script>
</body>
</html>

我正在尝试获取用户上传的文件,将其转换为文本,然后再发送出去,无论用户上传的是哪种类型的文件。我预计主要的输入类型是文本、pdf、docx,甚至图片。

对于图片,我找到了 ocrad,它运行得很好;但对于 pdf/docx,即使我尝试了 pdfreader、pdf-to-text、composer、pdf.js、docxtemplater、study.js 以及许多其他库,也始终无法正常工作。我像往常一样在终端中通过 node 安装了这些库,也进行了调试,但仍然找不到办法。这里是 CodePen,下面是代码片段;如果有人能确认某个现有的库确实可用,我将非常感激。

1 个答案:

答案 0 :(得分:1)

如果文档中没有说清楚,我很抱歉:pdfreader 只能在 Node.js 上运行,而不能在 Web 浏览器中运行,因此才会出现 require 相关的问题。

以下是在 Node.js 中将您的 PDF 简历转换为文本的方法:

// pdfreader is loaded through CommonJS require.
const pdfreader = require('pdfreader');

// Text fragments collected so far, keyed by y-position; reset between pages.
let rows = {};

/**
 * Logs every buffered row of text, one console line per row.
 *
 * Reads the module-level `rows` object, whose keys are y-positions and whose
 * values are arrays of text fragments. Keys are compared numerically (they
 * are strings holding floats), and each row's fragments are concatenated
 * without a separator before being logged.
 */
function printRows() {
  const positions = Object.keys(rows);

  // Numeric comparison: a plain string sort would put "10" before "2.5".
  positions.sort(function (first, second) {
    return parseFloat(first) - parseFloat(second);
  });

  for (const position of positions) {
    const fragments = rows[position] || [];
    console.log(fragments.join(''));
  }
}

// Parse the PDF item by item. The callback receives (err, item): an item with
// `page` marks the start of a page, an item with `text` is a text fragment,
// and a falsy item signals the end of the file.
new pdfreader.PdfReader().parseFileItems('CV_ErhanYasar.pdf', function(err, item){
  if (err) {
    // Report parse failures instead of silently ignoring them.
    console.error('Failed to parse PDF:', err);
    return;
  }
  if (!item) {
    // End of file: flush the rows of the last page. There is no item here,
    // so `item.page` must not be read (it would throw a TypeError).
    printRows();
    rows = {}; // leave no stale rows behind
    return;
  }
  if (item.page) {
    // New page: flush the rows buffered so far, then announce the page.
    printRows();
    console.log('PAGE:', item.page);
    rows = {}; // clear rows for next page
  }
  else if (item.text) {
    // accumulate text items into rows object, per line
    (rows[item.y] = rows[item.y] || []).push(item.text);
  }
});

=> 然后,您将得到如下输出:

example cv resume parse convert pdf to text