Question

我想解析一个日志文件，并将读取到的内容 POST 到某个请求端点。我已经实现了一个方案：每读取一行日志就发出一个请求。但是它没有任何背压（backpressure）机制，只会让服务器不堪重负，所以我想把发送速度降下来。

这促使我研究使用流管道（pipe）的方式，看看能否把文件中的数据直接传给 request.post()。但我无法让该 POST 请求把对象作为请求体（body）发送出去。

var stream = require('stream');
var request = require('request');
/**
 * Object-mode Transform stream that reassembles incoming chunks into
 * newline-delimited log lines, parses each line as JSON, and pushes
 * `body.body` of every entry whose `url` is "/api/usages" and whose
 * `method` is "POST" downstream.
 */
var liner = new stream.Transform( { objectMode: true } );

/**
 * Parses a single log line and pushes its request body to `target` when the
 * entry is a POST to /api/usages.
 *
 * @param {stream.Transform} target - stream that receives the request body.
 * @param {string} line - one complete line of the log file.
 * @throws {SyntaxError} when the line is not valid JSON.
 */
function pushUsageBody(target, line) {
    // Blank lines carry no log entry; JSON.parse('') would throw.
    if (line.trim() === '') return;

    var line_obj = JSON.parse(line);
    if( line_obj.url === "/api/usages" && line_obj.method === 'POST' ) {
        // Actually invoke push(); `push.bind(...)` only built a bound
        // function and discarded it, so nothing ever reached the consumer.
        target.push(line_obj.body.body);
    }
}

/**
 * Splits the chunk into lines, keeps the trailing partial line for the next
 * chunk, and forwards the matching request bodies.
 *
 * @param {Buffer|string} chunk - raw data read from the log file.
 * @param {string} encoding - unused; the chunk is converted with toString().
 * @param {function(Error=)} done - called once the chunk has been handled,
 *     with the error if a line could not be processed.
 */
liner._transform = function (chunk, encoding, done) {
    var data = chunk.toString();
    if (this._lastLineData) data = this._lastLineData + data;

    var lines = data.split('\n');
    // The last element is either an incomplete line or '' (when the chunk
    // ended on a newline); hold it back until more data arrives.
    this._lastLineData = lines.pop();

    try {
        for (var i = 0; i < lines.length; i++) {
            pushUsageBody(this, lines[i]);
        }
    } catch (err) {
        // Report bad lines through the stream instead of throwing out of write().
        return done(err);
    }
    done();
};

/**
 * Processes the final line of the input when it does not end with a newline;
 * without this the held-back remainder would be dropped.
 *
 * @param {function(Error=)} done - called once the remainder has been handled.
 */
liner._flush = function (done) {
    var rest = this._lastLineData;
    this._lastLineData = null;

    try {
        if (rest) pushUsageBody(this, rest);
    } catch (err) {
        return done(err);
    }
    done();
};

// Entry script: streams the log file named on the command line through
// `liner` and pipes the result into a POST request to the usages endpoint.
var file_name = process.argv[2];
if (!file_name) {
    // Without a path, fs.createReadStream() would fail with a less helpful error.
    console.error('Usage: node ' + process.argv[1] + ' <log-file>');
    process.exit(1);
}
console.log('Reading from ' + file_name);

var fs = require('fs');
var liner = require('./liner');
var source = fs.createReadStream(file_name);

// pipe() does not forward errors from one stream to the next, so each stage
// needs its own handler; otherwise a read or parse failure surfaces as an
// unhandled 'error' event.
source.on('error', function(err) {
    console.log(err);
});
liner.on('error', function(err) {
    console.log(err);
});

// NOTE(review): liner is created in object mode; the request stream presumably
// expects string/Buffer chunks - confirm what liner pushes before relying on this pipe.
source.pipe(liner).pipe(request
                            .post("http://localhost:8081/api/usages")
                            .on('response', function(response) {
                                console.log(response.statusCode) // 200
                            })
                            .on('error', function(err) {
                                console.log(err);
                            }));

转换函数中的 push 调用看起来工作正常，但该对象并没有作为 request.post() 的请求体被发送出去。

我错过了什么？

这种方式能否提供我想要的背压，从而在整个文件读取完成之前就对 POST 调用进行限流？

Answer 1

我发现无法把流直接通过管道传给 HTTP 请求，因为（根据规范）需要事先知道 Content-Length。一个不太理想的替代方案是分段上传（multipart upload）：每从转换流中读取到一个数据块，就把它作为一个分段发送给接收端 API。这也意味着接收端 API 必须能够接收分段上传，并在收到并确认所有分段后重新组装出整个文件。AWS S3 支持分段上传，可以作为一个很好的示例：http://docs.aws.amazon.com/AmazonS3/latest/dev/mpuoverview.html

我想将我的转换数据传输到我管理的另一个API，但考虑到我的文件真的不那么大，似乎不值得。如果我改变主意，我会更新这个答案：）

Answer 2

虽然我没能找到流式传输（streaming）问题的解决方案，但我找到了一个解决背压问题的简单方法。

我使用async.queue将工作推送到一个简单的任务队列中。

// Build the send queue: a keep-alive connection pool plus an async.queue
// whose worker POSTs each task's request body.
var pool = new http.Agent({
    keepAlive: true,
    keepAliveMsecs: 10000,
    maxSockets: Math.floor(send_queue_concurrency*1.5)
});

var q = async.queue(function(task, callback){
    var options = {
        url    : 'http://localhost:8081/xxxxxx',
        method : 'POST',
        json   : task.req_body,
        gzip   : true,
        pool   : pool,
        timeout: 30000
    };

    request(options, function(error, response, body){
        if (error) {
            // Transport-level failure: log it and count it.
            console.log('request error : ' + error);
            post_status.fail++;
        } else if (response.statusCode === 400) {
            // Dump the response body of rejected requests.
            console.dir(body);
        }
        // Always signal completion so the queue can start the next task.
        callback();
    });
}, send_queue_concurrency);

// Run `done` once the queue has been emptied.
q.drain = done;

send_queue_concurrency是控制请求压力的主要杠杆。

我使用文件解析例程将工作推入队列：

// For every line read from the file: count it, parse it as JSON and enqueue
// its request body for sending. Lines that fail to parse are dumped and skipped.
rl.on('line', function(rawLine) {
    line_count++;
    try {
        var parsed = JSON.parse(rawLine);
        var task = {req_body: parsed.body.body, line_object: parsed};
        q.push(task, function(queueErr){
            if (!queueErr) {
                return;
            }
            console.log('queue error! '+JSON.stringify(queueErr));
        });
    } catch( parseErr ) {
        console.dir(parseErr);
    }
});

/**
 * Completion callback: prints the final message and terminates the process
 * with a success exit code.
 */
var done = function() {
    var exitCode = 0;
    // Reporting stats would be printed here, before the final message.
    console.log('\ndone.');
    process.exit(exitCode);
};

通过管道将读取流（经转换后）传给 request.post()

2 个答案: