Question

我正在尝试启动一个集群，从 Google Cloud Storage 以流的方式读取文件（以换行符分隔的 JSON），并在从 MongoDB 获取数据后转换每一行。转换完成后，我想把这些行存入 Google BigQuery——每次 10000 行。这一切都能正常运行，但问题是：流式文件的处理速率会随着时间的推移显著下降。

我把 Node 应用程序部署在一台服务器上，把 MongoDB 部署在另一台服务器上，两台机器均为 8 核、30GB 内存。脚本开始执行时，应用服务器和 MongoDB 服务器的 CPU 使用率约为 70%–75%；30 分钟后，CPU 使用率降至 10%，最后降至 1%。应用程序没有抛出任何异常。从应用日志可以看到，它已经处理完几个文件，并开始处理新的文件。可以观察到，一次执行从下午 3:00 刚过开始，一直持续到将近下午 5:20。

// Module-level dependencies and shared handles used by every function below.
var cluster = require('cluster'),
    os = require('os'),
    // One worker is forked per reported CPU core (see the cluster bootstrap).
    numCPUs = os.cpus().length,
    async = require('async'),
    fs = require('fs'),
    google = require('googleapis'),
    // REST client used by insertRowsAsStream for tabledata.insertAll.
    bigqueryV2 = google.bigquery('v2'),
    gcs = require('@google-cloud/storage')({
        projectId: 'someproject',
        keyFilename: __dirname + '/auth.json'
    }),
    // NOTE(review): `bigquery` is not declared anywhere in the visible code, so
    // this line would throw a ReferenceError at load time unless it is defined
    // elsewhere; `dataset` is also not referenced below — confirm and fix.
    dataset = bigquery.dataset('somedataset'),
    bucket = gcs.bucket('somebucket.appspot.com'),
    JSONStream = require('JSONStream'),
    Transform = require('stream').Transform,
    MongoClient = require('mongodb').MongoClient,
    mongoUrl = 'mongodb://localhost:27017/bigquery',
    // Assigned once the MongoDB connection is established in the worker branch.
    mDb,
    groupA,
    groupB;

// In-memory batch of transformed rows waiting to be sent to BigQuery, shared
// by every transformer in this process.
var rows = [];
// Number of rows currently buffered in `rows`.
var rowsLen = 0;

/**
 * Builds an object-mode Transform stream that enriches each parsed row with
 * values looked up in the `groupA` and `groupB` MongoDB collections and
 * buffers the result in the shared `rows` array.
 *
 * Buffered rows are handed to `insertRowsAsStream` in batches of 10000, and
 * whatever is still buffered when the stream ends is sent as a final batch.
 * The stream never pushes anything to its readable side.
 *
 * @returns {Transform} a fresh transform stream; create one per input file.
 */
function transformer() {

    var BATCH_SIZE = 10000,
        t = new Transform({ objectMode: true });

    // Point-in-polygon filter for the row's coordinates. A fresh object is
    // built for every query so the two lookups never share state.
    function pointQuery(row) {
        return {
            'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
        };
    }

    // Reports a failed lookup and drops the row, keeping the stream alive.
    function skipRow(err, cb) {
        console.error('MongoDB lookup failed, row skipped:', err);
        cb();
    }

    // Sends up to `count` buffered rows to BigQuery and resyncs the counter.
    function sendBatch(count) {
        insertRowsAsStream(rows.splice(0, count));
        rowsLen = rows.length;
    }

    t._transform = function(row, encoding, cb) {
        if (!row) return cb();

        groupA.findOne(pointQuery(row), {
            fields: { 'properties.OA_SA': 1, _id: 0 }
        }, function(err, a) {
            if (err) return skipRow(err, cb);
            groupB.findOne(pointQuery(row), {
                fields: { 'properties.WZ11CD': 1, _id: 0 }
            }, function(err, b) {
                if (err) return skipRow(err, cb);
                row.groupA = a ? a.properties.OA_SA : null;
                row.groupB = b ? b.properties.WZ11CD : null;

                // cache processed rows in memory
                rows.push({ json: row });
                rowsLen = rows.length;

                if (rowsLen >= BATCH_SIZE) {
                    // batch insert rows in bigquery table and free memory
                    log('inserting 10000');
                    sendBatch(BATCH_SIZE);
                }

                cb();
            });
        });
    };

    // Without this, rows left below the batch threshold at end of input would
    // never be inserted.
    t._flush = function(cb) {
        if (rows.length > 0) {
            log(`inserting ${rows.length}`);
            sendBatch(rows.length);
        }
        cb();
    };

    return t;
}

/**
 * Prints a single value to standard output.
 *
 * @param {*} message value forwarded unchanged to console.log.
 */
var log = (message) => {
    console.log(message);
};

/**
 * Sends one batch of rows to the BigQuery table through tabledata.insertAll.
 *
 * @param {Array<{json: Object}>} rows rows in the insertAll request format.
 * @param {function(?Error)=} callback optional; invoked once with the request
 *     error, an Error built from the response's `insertErrors`, or null on
 *     success. When omitted, failures are written to stderr instead.
 */
function insertRowsAsStream(rows, callback) {
    bigqueryV2.tabledata.insertAll({
        "projectId": 'someproject',
        "datasetId": 'somedataset',
        "tableId": 'sometable',
        "resource": {
            "kind": "bigquery#tableDataInsertAllRequest",
            "rows": rows
        }
    }, function(err, res) {
        // The response can carry per-row failures without a request error.
        if (res && res.insertErrors && res.insertErrors.length) {
            console.log(res.insertErrors[0].errors);
            err = err || new Error(JSON.stringify(res.insertErrors));
        }

        if (typeof callback === 'function') {
            return callback(err || null);
        }

        // Nobody is listening: make sure the failure is at least visible.
        if (err) {
            console.error(err);
        }
    });
}


/**
 * Streams one file from the Google Cloud Storage bucket through the JSON
 * parser and the transformer.
 *
 * @param {string} fileName object name inside `bucket`.
 * @param {function(Error=)} cb invoked exactly once: with no argument when the
 *     transformer has finished the file, or with the error emitted by any
 *     stage of the pipeline.
 */
function startStream(fileName, cb) {
    // stream a file from Google cloud storage
    var file = bucket.file(fileName),
        called = false;

    // Several events can signal completion; only the first one reaches cb.
    function done(err) {
        if (called) return;
        called = true;
        cb(err);
    }

    log(`Processing file ${fileName}`);

    var source = file.createReadStream(),
        parser = JSONStream.parse(),
        sink = transformer();

    // pipe() does not forward errors, so each stage needs its own handler.
    source.on('error', done);
    parser.on('error', done);
    sink.on('error', done);

    // Completion is reported when the transformer is done, not when the source
    // has merely been read, so the caller's concurrency limit stays meaningful.
    sink.on('finish', function() {
        log('transformation ended');
        done();
    });

    source.pipe(parser).pipe(sink);
}

/**
 * Processes the given files with startStream, at most three at a time, and
 * appends a '1' to complete_count.txt once all of them have completed.
 *
 * @param {string[]} files file names to process; the array is not modified.
 *     Files are taken from the end of the list first.
 * @param {number|string} cpuIdentifier label used in log messages (the caller
 *     passes the worker's pid).
 */
function processFiles(files, cpuIdentifier) {
    if (!files || files.length === 0) return;

    // Work on a private copy so the caller's array is left untouched.
    var pending = files.slice();
    var tasks = pending.map(function() {
        return function(cb) {
            startStream(pending.pop(), cb);
        };
    });

    // process 3 files in parallel
    async.parallelLimit(tasks, 3, function(err) {
        if (err) {
            console.error(`child process ${cpuIdentifier} failed:`, err);
            return;
        }

        log(`child process ${cpuIdentifier} completed the task`);
        // A callback is mandatory for fs.appendFile on current Node.js.
        fs.appendFile(__dirname + '/complete_count.txt', '1', function(appendErr) {
            if (appendErr) {
                console.error('could not update complete_count.txt:', appendErr);
            }
        });
    });
}

// Entry point. The master process forks one worker per CPU core; each worker
// connects to MongoDB, stores the two collection handles used by the
// transformer and then starts processing files.
if (!cluster.isMaster) {
    MongoClient.connect(mongoUrl, function(connectErr, db) {
        if (connectErr) throw (connectErr);
        mDb = db;
        groupA = mDb.collection('groupageo');
        groupB = mDb.collection('groupbgeo');
        // `files` is an array of file names
        // each file is in newline json delimited format
        // ["1478854974993/000000000000.json","1478854974993/000000000001.json","1478854974993/000000000002.json","1478854974993/000000000003.json","1478854974993/000000000004.json","1478854974993/000000000005.json"]
        // NOTE(review): `files` is not defined in the visible code, and every
        // worker is handed the same name — confirm how the list is split
        // between workers.
        processFiles(files, process.pid);
    });
} else {
    var remaining = numCPUs;
    while (remaining-- > 0) {
        cluster.fork();
    }
}

Answer 1

好的，我找到了罪魁祸首！Google API 的 Node.js 客户端库使用名为“stream-events”的模块来实现 Streams 0.8 风格的流。Streams 0.8 不会根据消费者处理数据的能力来控制触发“data”事件的速率；这种速率控制功能是在 Streams 1.0 中才引入的。这实际上意味着，可读流在以 MongoDB 无法承受的速率向它推送数据。

解决方案：我改用 'request' 模块来代替 Google 的客户端库。我把一个签名 URL 交给 request 模块，它再把响应结果以流的形式传给我的转换器（transformer）。

经验总结：务必检查所用模块采用的是哪个版本的流（Streams）实现。

Node.js可读流随着时间的推移而减慢，CPU使用率下降

1 个答案: