如何将PDF发送到Watson的文档转换服务而不先将其写入磁盘?

时间:2016-05-23 16:48:34

标签: ibm-cloud ibm-watson document-conversion

我正在尝试使用 watson-developer-cloud node.js 库,将这份文档(http://www.redbooks.ibm.com/redbooks/pdfs/ga195486.pdf)通过 Watson 文档转换服务转换为答案单元(answer units)。

在实际程序(不是这个测试程序)中,我正在检索文档并即时转换它,而不是先将其写入磁盘。我之前已经使用过其他文档,但最新版本的库(v 1.7.0 )似乎已经改变,它不再像我使用它那样工作。但即使在我开始使用最新版本之前,此特定文档也无法转换。

我正在使用的带注释的测试代码如下。我已经尝试了几种方法来实现这一点,其中的变体都在下面的 var opts = { 下进行了注释。您必须一次取消注释其中一个以查看结果。

'use strict';
var bluemix = require('./bluemix');
var extend=require('util')._extend;
var fs=require('fs');
var watson=require('watson-developer-cloud');
var streams = require('memory-streams');

// Placeholder credentials used as the base object; whatever
// bluemix.getServiceCreds returns for 'document_conversion' (VCAP_SERVICES,
// per the original note) is merged onto it by extend().
var localCredentials = {
  url: '<url>',
  version: 'v1',
  username: '<username>',
  password: '<password>'
};
var boundCredentials = bluemix.getServiceCreds('document_conversion');
var dcCredentials = extend(localCredentials, boundCredentials);
var document_conversion = watson.document_conversion(dcCredentials);

// Load the sample PDF from disk and hand it to the conversion routine.
var bookpdf = getBook('ga195486.pdf');
convert(bookpdf);

/**
 * Read a document from disk as raw bytes.
 *
 * No encoding is passed to fs.readFileSync, so the result is a Buffer.
 * A PDF is binary data: decoding it as 'utf8' turns byte sequences that are
 * not valid UTF-8 into replacement characters, so the bytes sent on to the
 * service would no longer match the file.
 *
 * @param {string} filename - Path of the file to read.
 * @returns {Buffer} The file's contents, byte for byte.
 * @throws {Error} Whatever fs.readFileSync throws (e.g. a missing file).
 */
function getBook(filename)
   {
   return fs.readFileSync(filename);
   }

/**
 * Ask the Document Conversion service to convert a PDF into answer units and
 * log the outcome.
 *
 * As written, `content` is only used by the commented-out `file:` variants
 * below; exactly one of them has to be uncommented for the document to be
 * attached to the request.
 *
 * The callback logs one of three outcomes: the error, "No answer units"
 * (when the response has no non-empty `answer_units` array), or "Works!"
 * followed by the response.
 *
 * @param {Buffer|string} content - The document data to upload.
 * @returns {undefined}
 */
function convert(content)
   {
   var opts={ //uncomment ONE of these
//      file: new Buffer(content), //See message #1 below
//      file: {value: new Buffer(content), options: {}}, //see message #2 below
//      file: {value: new Buffer(content), options: {contentType: "application/pdf"}}, //This used to work. See message #2 (again) below
//      file: new streams.ReadableStream(content),//see message #3 below
      conversion_target: "ANSWER_UNITS",
      content_type:'application/pdf'
      };
   document_conversion.convert(opts,
      function (err, response)
         {
         if (err)
            {
            console.log("Error converting doc: ", err);
            return;
            }
         // Guard against a response that lacks answer_units so the callback
         // reports the empty result instead of throwing a TypeError.
         var units=response && response.answer_units;
         if (!Array.isArray(units) || units.length===0)
            {
            var msg="No answer units";
            console.log(msg,response);
            return;
            }
         console.log('Works!');
         console.dir(response);
         }
      );
   }

//Message #1: This returns: 
//  No answer units { source_document_id: '',
//  timestamp: '2016-05-23T16:18:23.825Z',
//  media_type_detected: 'application/pdf',
//  metadata: [],
//  answer_units: [],
//  warnings: 
//   [ { phase: 'pdf',
//       warning_id: 'empty_input_to_converter',
//       description: 'The input provided to the converter phase is empty or doesn\'t contain text that can be converted.' },
//     { phase: 'normalized_html',
//       warning_id: 'empty_input_to_converter',
//       description: 'The input HTML document has no body content.' },
//     { phase: 'answer_units',
//       warning_id: 'empty_input_to_converter',
//       description: 'The input provided to the converter phase is empty or doesn\'t contain text that can be converted.' } ] }


//Message #2: These return:
///home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/node_modules/request/node_modules/combined-stream/node_modules/delayed-stream/lib/delayed_stream.js:33
//  source.on('error', function() {});
//
//TypeError: source.on is not a function
//    at Function.DelayedStream.create (/home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/node_modules/request/node_modules/combined-stream/node_modules/delayed-stream/lib/delayed_stream.js:33:10)
//    at FormData.CombinedStream.append (/home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/node_modules/request/node_modules/combined-stream/lib/combined_stream.js:43:37)
//    at FormData.append (/home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/node_modules/request/node_modules/form-data/lib/form_data.js:68:3)
//    at appendFormValue (/home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/node_modules/request/request.js:339:21)
//    at Request.init (/home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/node_modules/request/request.js:352:11)
//    at new Request (/home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/node_modules/request/request.js:142:8)
//    at request (/home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/node_modules/request/index.js:55:10)
//    at createRequest (/home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/lib/requestwrapper.js:134:10)
//    at DocumentConversion.convert (/home/david/git/ccb-contentbridge/node_modules/watson-developer-cloud/services/document_conversion/v1.js:134:10)
//    at convert (/home/david/git/ccb-contentbridge/testRedbooks.js:35:24)


//Message #3: This returns and then it hangs there:
//Error converting doc:  { code: 400, error: 'Error in the web application' }

有人可以告诉我我做错了什么吗?

2 个答案:

答案 0 :(得分:1)

该特定文件大于文档转换服务当前可以处理的文件。不幸的是,我没有很好的信息确切地知道现在的限制是什么,但是团队已经意识到这一点并且正在寻求改进。

如果您可以提供一个以前可以正常工作、但在 node.js 库升级到 v1.7.0 之后失效的示例,我会看一下,希望能够提供更准确的信息。

哦,另外,在调用 fs.readFileSync() 时指定 'utf8' 编码,也可能是导致您遇到部分问题的原因。

答案 1 :(得分:1)

根据此处的说明,文档转换服务(Document Conversion)的文件大小限制似乎是 50 MB,而我们的文件小于这个大小……所以一定是其他问题。