我已经使用了'pdf.js-extract' npm模块从pdf中获取数据。
// Load the pdf.js-extract module and create an extractor instance.
var PDFExtract = require('pdf.js-extract').PDFExtract;
var pdfExtract = new PDFExtract();
var filename = "/home/aman/Downloads/sample_invoice.pdf";
// extract() is called as (filename, options, callback); an empty options
// object is passed explicitly so the callback is not taken for the options.
pdfExtract.extract(filename, {} /* options */, function (err, data) {
    // On failure, log the error and stop.
    if (err) return console.log(err);
    // On success, dump the whole extracted structure as JSON.
    console.log(JSON.stringify(data));
});
但是我没有得到想要的结果。我想从发票 PDF 中提取相关信息,例如税金、已付款总额、卖家地址,并将提取到的数据保存到 MongoDB 集合中。
答案 0 :(得分:0)
您必须针对每种发票格式分别编写函数(例如 fn company1、fn company2……)。
下面的示例包含三个不同的函数,用于从 pdf.js-extract
模块的输出中检索数据:
// Sample invoice data: a single page whose content holds two text items,
// a label ("Invoice Number") followed by its value ("INV-3337").
let sampleInvoice = {
    pages: [
        {
            content: [
                {
                    x: 348.41, y: 125.59899999999993,
                    str: "Invoice Number", dir: "ltr",
                    width: 61.61760000000001, height: 8.8,
                    fontName: "g_d0_f2"
                },
                {
                    x: 451.935, y: 125.59899999999993,
                    str: "INV-3337", dir: "ltr",
                    width: 37.171200000000006, height: 8.8,
                    fontName: "g_d0_f2"
                }
            ]
        }
    ]
};
// Try each lookup function against the sample invoice and show every result
// in its own alert dialog (alert is only available in a browser).
for (const lookup of [
    () => searchByPosition(sampleInvoice.pages, 450, 125),
    () => searchByPrev(sampleInvoice.pages, 'Invoice Number'),
    () => searchByFormat(sampleInvoice.pages, /INV-\d+$/),
]) {
    alert(lookup());
}
/**
 * Find the first text item located near a given position.
 *
 * An item matches when both its x and its y coordinate differ from the
 * requested position by at most `range`. Items are checked page by page, in
 * content order, and the first match wins.
 *
 * The comparison is a true distance check rather than a comparison of
 * `Math.floor(value / range)` buckets: with buckets, two points that are only
 * a few units apart but sit on opposite sides of a multiple of `range`
 * (e.g. x = 449 and x = 451) would never match.
 *
 * @param {Array<{content: Array<{x: number, y: number, str: string}>}>} pages
 *        Pages to search; each page exposes its text items in `content`.
 * @param {number} x - Expected horizontal position.
 * @param {number} y - Expected vertical position.
 * @param {number} [range=10] - Maximum allowed difference on each axis.
 * @returns {string} The `str` of the first matching item, or 'NotFound'.
 */
function searchByPosition(pages, x, y, range = 10)
{
    // Loop in all pages
    for (const page of pages)
    {
        // Loop in all content of the page
        for (const item of page.content)
        {
            const isCloseOnX = Math.abs(item.x - x) <= range;
            const isCloseOnY = Math.abs(item.y - y) <= range;
            if (isCloseOnX && isCloseOnY)
            {
                return item.str;
            }
        }
    }
    // No results found
    return 'NotFound';
}
/**
 * Return the text of the item that directly follows a given label.
 *
 * The label is compared case-insensitively against each item's `str`. A label
 * that is the last item of its page has no follower and is skipped, so the
 * search continues with the remaining pages.
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to search.
 * @param {string} txt - Label text that precedes the wanted value.
 * @returns {string} The `str` of the item after the label, or 'NotFound'.
 */
function searchByPrev(pages, txt)
{
    const label = txt.toLowerCase();
    for (const { content } of pages)
    {
        // Index of the first item that equals the label and has a follower.
        const labelIndex = content.findIndex(
            (item, index) => item.str.toLowerCase() === label && content[index + 1]
        );
        if (labelIndex !== -1)
        {
            return content[labelIndex + 1].str;
        }
    }
    // No results found
    return 'NotFound';
}
/**
 * Return the first text item whose `str` matches a regular expression.
 *
 * `regex.lastIndex` is reset before every test. A regex with the `g` or `y`
 * flag remembers where its previous match ended, so without the reset a
 * reused regex could start scanning in the middle of a string and miss a
 * valid match.
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to search.
 * @param {RegExp} regex - Pattern the wanted text must match.
 * @returns {string} The `str` of the first matching item, or 'NotFound'.
 */
function searchByFormat(pages, regex)
{
    // Loop in all pages
    for (const page of pages)
    {
        // Loop in all content of the page
        for (const item of page.content)
        {
            // Always scan from the start of the string (see note above).
            regex.lastIndex = 0;
            if (regex.test(item.str))
            {
                return item.str;
            }
        }
    }
    // No results found
    return 'NotFound';
}
答案 1 :(得分:0)
请参考 pdf.js-extract npm模块的GitHub存储库 https://github.com/ffalt/pdf.js-extract
以下代码来自该 GitHub 仓库中的 example/example.js 文件:
var fs = require('fs');
var PDFExtract = require('../lib').PDFExtract;
var pdfExtract = new PDFExtract();

// Extract ./example.pdf, then write the raw result as JSON and the text rows
// of the first page as plain text.
pdfExtract.extract('./example.pdf', {} /* options*/, (err, data) => {
    if (err) {
        return console.log(err);
    }

    // Raw extraction result, tab-indented.
    fs.writeFileSync('./example-output.json', JSON.stringify(data, null, '\t'));

    // Turn the first page into lines, then into rows of text fragments.
    const firstPageLines = PDFExtract.utils.pageToLines(data.pages[0], 2);
    const textRows = PDFExtract.utils.extractTextRows(firstPageLines);

    // One output line per row, with the fragments of a row concatenated.
    const joinedRows = textRows.map((row) => row.join(''));
    fs.writeFileSync('./example-output.txt', joinedRows.join('\n'));

    console.log(JSON.stringify(data, null, '\t'));
});
希望它对您有用
答案 2 :(得分:0)
使用pdf-extract npm软件包(https://www.npmjs.com/package/pdf-extract),可以从pdf中提取文本。
// Extract text from PDF files (with images)
// Installation guide: https://github.com/nisaacson/pdf-extract
var extract = (function () {
  'use strict';

  const fs = require('fs');
  const path = require('path');
  const pdfExtract = require('pdf-extract');

  // Options used when the caller does not pass any.
  const defaultOptions = {
    type: 'ocr',
    ocr_flags: [
      '-l eng',
    ]
  };

  /**
   * Extract the text of a PDF and save it next to it as `<filePath>.txt`.
   *
   * @param {string} filePath - Path of the PDF to process.
   * @param {object} [options] - Options handed to pdf-extract; falls back to
   *        `defaultOptions` when omitted.
   * @param {function(?Error, string=)} [callback] - Node-style callback. When
   *        omitted, errors go to console.error and the result to console.log.
   * @returns {*} Whatever the callback returns when no file path was given,
   *          otherwise undefined.
   */
  function init(filePath, options, callback) {
    const done = callback || function (error, response) {
      if (error) { return console.error(error); }
      return console.log(response);
    };
    const effectiveOptions = options || defaultOptions;

    if (!filePath) {
      return done(new Error('No input file (PDF) specified.'));
    }

    // Pass the resolved options on; they are what pdf-extract needs.
    processFile(filePath, effectiveOptions, done);
  }

  /**
   * Run pdf-extract on a file and save the extracted text pages.
   *
   * @param {string} filePath - Path of the PDF to process.
   * @param {object} options - Options handed to pdf-extract.
   * @param {function(?Error, string=)} callback - Called once, with either an
   *        error or the success message produced by saveFile.
   */
  function processFile(filePath, options, callback) {
    // Errors can arrive through both the start-up callback and the 'error'
    // event; make sure the caller only hears about the first outcome.
    let isSettled = false;
    const finish = function (error, response) {
      if (isSettled) { return; }
      isSettled = true;
      callback(error, response);
    };

    const processor = pdfExtract(filePath, options, function (error) {
      if (error) {
        finish(error);
      }
    });

    // NOTE(review): presumably pdf-extract returns no processor when start-up
    // fails; guard so that `.on` is not called on undefined.
    if (!processor) {
      finish(new Error('pdf-extract did not return a processor.'));
      return;
    }

    processor.on('complete', function (data) {
      saveFile(filePath + '.txt', data.text_pages, finish);
    });
    processor.on('error', function (error) {
      finish(error);
    });
  }

  /**
   * Write the extracted text to disk.
   *
   * @param {string} filePath - Destination path; normalized before writing.
   * @param {string|string[]} string - Text to save. An array (one entry per
   *        page) is joined with newlines.
   * @param {function(?Error, string=)} callback - Receives the write error,
   *        or null and a 'Saved file …' message once the file is written.
   * @returns {*} Whatever the callback returns.
   */
  function saveFile(filePath, string, callback) {
    // Normalize file path
    const targetPath = path.normalize(filePath);
    const text = Array.isArray(string) ? string.join('\n') : string;

    try {
      // Save file
      fs.writeFileSync(targetPath, text, 'utf8');
    } catch (error) {
      return callback(error);
    }

    // Report success only after the write succeeded, and as the result
    // argument rather than the error argument.
    return callback(null, 'Saved file ' + targetPath);
  }

  const api = {
    init: init
  };
  module.exports = api;

  // Execute script if not used as a module
  if (require.main === module) {
    init(process.argv[2]);
  }

  return api;
}());
答案 3 :(得分:0)
文件 readPdf.js
/**
 * Read a PDF through `pdfExtract.extract` and expose the result as a Promise.
 *
 * NOTE(review): `pdfExtract` is not defined in this snippet; it must be in
 * scope where this function runs (presumably a pdf.js-extract `PDFExtract`
 * instance; confirm).
 *
 * @param {string} file - Path of the PDF to read.
 * @returns {Promise<*>} Resolves with the value the extractor passes to its
 *          callback. Rejects when extraction fails (the original error is
 *          attached as `cause`), when the extractor throws synchronously, or
 *          when no answer arrives within 10000 ms.
 */
const readPdf = (file) => new Promise((resolve, reject) => {
  const timeoutMs = 10000;

  // Set up the timeout first so it also covers an extractor that never
  // calls back.
  const timer = setTimeout(() => {
    reject(new Error('Promise timed out after ' + timeoutMs + ' ms'));
  }, timeoutMs);

  try {
    // An empty options object is passed explicitly so the callback is not
    // taken for the options.
    pdfExtract.extract(file, {}, (error, text) => {
      // The outcome is known: stop the timer so it does not keep the
      // process alive or reject later.
      clearTimeout(timer);
      if (error) {
        reject(new Error('El archivo no se pudo leer', { cause: error }));
        return;
      }
      resolve(text);
    });
  } catch (error) {
    // A synchronous failure must reject instead of leaving the promise
    // pending.
    clearTimeout(timer);
    reject(error);
  }
});
module.exports = { readPdf };
文件xxx.js
// Load the helper from the local readPdf.js file; the './' prefix keeps
// require from looking for a package named 'readPdf' in node_modules.
const { readPdf } = require('./readPdf');

// NOTE(review): `files` is not defined in this snippet; it is expected to be
// the path of the PDF to read.
readPdf(files)
  .then((response) => {
    console.log(response); // this is your data
  })
  .catch((err) => console.log(err));