在 Node.js 中读取和处理大文件(1–5 GB)的最有效方法是什么?

时间:2015-12-07 04:14:30

标签: javascript node.js neo4j

我有一段代码,它使用 fs.createReadStream 逐行读取大型 XML 文件,并把每一行保存到数组中。当数组长度达到上限(例如 20)时,我会暂停读取流,然后用 async.queue 处理该数组。

// code
var fs      = require('fs');
var util = require('util');
var stream = require('stream');
var es = require("event-stream");

var async   = require('async');
var request = require("request");
var neo4j = require('node-neo4j');
var exec    = require('child_process').exec;
var util = require('util');

var dbPediaExtractor = require('./dbPediaExtractor');
var dbpExt = new dbPediaExtractor();


// Neo4j connection settings used to build the transactional commit URL.
var host = 'localhost';
var port = 7474;

// NOTE(review): credentials are hard-coded in source; consider loading them
// from the environment or a config file before sharing this script.
var username = "neo4j";
var password = "12345";
// Buffer.from replaces the deprecated `new Buffer(string)` constructor.
var auth = "Basic " + Buffer.from(username + ":" + password).toString("base64");
var httpUrlForTransaction = 'http://' + host + ':' + port + '/db/data/transaction/commit';

// Defaults for the batch threshold and the async.queue worker count; the
// command-line handling at the bottom of the file may override both.
var limit = 200;
var concurrency = 200;
var action = process.argv[2];
var defLimit = process.argv[3];
var defConcurrency = process.argv[4];

// Name of the decompressed file. Guarded so that running the script without
// arguments no longer throws here, and anchored so that only a trailing
// ".bz2" is stripped (a ".bz2" in the middle of the name is left alone).
var _ntfile = (typeof action === 'string') ? action.replace(/\.bz2$/, '') : undefined;

// Query strings accumulated for the batch currently being processed.
var queryArray = [];

/**
 * Posts one Cypher statement to the Neo4j transactional commit endpoint.
 *
 * @param {string} statements - Cypher text placed in the "statement" field of the request body.
 * @param {function} callback - Invoked as callback(err, body) with the error and parsed body
 *                              reported by `request.post`.
 */
function runCypherQuery(statements, callback) {
    var payload = {
        "statements": [
            { "statement": statements }
        ]
    };

    var options = {
        uri: httpUrlForTransaction,
        json: payload,
        headers: { "Authorization": auth }
    };

    request.post(options, function onResponse(err, res, body) {
        // Only the error and the body are forwarded; the response object is not.
        callback(err, body);
    });
}

/**
 * Runs every entry of a batch through `dbpExt.run` using an `async.queue`
 * worker pool and appends each produced query string to the shared
 * `queryArray`.
 *
 * @param {Array} item - Batch of entries; each one is passed to `dbpExt.run`.
 * @param {function} cb - Called with no arguments once the whole batch has
 *                        been processed (immediately for an empty batch).
 */
var carrier = function(item, cb) {

    // With nothing queued `drain` would never fire and the caller would wait
    // forever, so report completion right away.
    if (!item || item.length === 0) {
        cb();
        return;
    }

    var q = async.queue(function (task, callback) {

        dbpExt.run(task, function (err, querString) {
            if (err) {
                // The failed entry is skipped, but the failure is made visible
                // instead of being dropped silently.
                console.log("dbpExt.run failed: ", err);
            } else {
                queryArray.push(querString);
            }
            callback();
        });
    }, concurrency);

    // NOTE(review): assigning `drain` is the registration style of older
    // `async` releases; newer major versions appear to expect `q.drain(fn)` —
    // confirm against the installed version.
    q.drain = function() {
        var message = 'finished processing '+item.length+" number of queries ";
        message = message + "with concurrency of "+concurrency+" processe(s)";
        console.log(message);
        cb();
    };

    // forEach keeps the loop variable local (the former `for (i in item)`
    // created an implicit global `i`) and never visits inherited keys.
    item.forEach(function (entry) {
        q.push(entry);
    });
};


/**
 * Completion callback for the decompression command. On success it streams
 * `_ntfile` line by line, collects the first `<...>` value of every line into
 * a batch, and each time the batch reaches `limit` entries pauses the stream,
 * converts the batch through `carrier`, posts the joined query text with
 * `runCypherQuery`, and resumes. The remainder is flushed on 'close'; the
 * file is deleted on 'end'.
 *
 * @param {Error|null} error - Error reported by the child process, or null.
 * @param {string} stdout - Child process standard output (unused).
 * @param {string} stderr - Child process standard error (unused).
 */
var exCallback = function(error, stdout, stderr){
    if (error !== null) {
        console.log("error: \n",error);
        return;
    }

    var urlPattern = /<(.*?)>/;
    var itemJson = [];
    var rstream;

    // Returns the first <...>-delimited value among the space-separated
    // tokens of a line, or undefined when the line has none.
    function firstUrl(line) {
        var tokens = line.toString().split(" ");
        for (var t = 0; t < tokens.length; t++) {
            var found = tokens[t].match(urlPattern);
            if (found !== null) {
                return found[1];
            }
        }
        return undefined;
    }

    // Sends the current batch to Neo4j, resets the shared state and then calls
    // done() exactly once, whether or not the post succeeded.
    function flushBatch(done) {
        var batch = itemJson;
        itemJson = [];

        // Nothing collected: skip the round trip entirely.
        if (batch.length === 0) {
            done();
            return;
        }

        console.log(batch);
        carrier(batch, function(err) {
            if (err) {
                console.log("reply status: ",err);
            }

            var querString = queryArray.join(" ");
            queryArray = [];

            // Every entry of the batch failed to produce a query.
            if (querString === "") {
                done();
                return;
            }

            runCypherQuery(querString, function (err, resp) {
                if (err) {
                    // Previously this called an undefined `callback`, which threw
                    // and left the stream paused; log and carry on instead.
                    console.log("reply status: ",err);
                } else {
                    console.log(querString);
                }
                done();
            });
        });
    }

    rstream = fs.createReadStream(_ntfile)
    .pipe(es.split())
    .pipe(es.mapSync(function(line) {
        var url = firstUrl(line);

        // Lines without a <...> token (e.g. a trailing blank line) add nothing.
        if (url === undefined) {
            return;
        }

        // Store the current line BEFORE checking the threshold so the line
        // that fills the batch is part of it rather than being dropped.
        itemJson.push(url);
        if (itemJson.length < limit) {
            return;
        }

        // Hold further lines back until the batch has been written.
        rstream.pause();
        flushBatch(function() {
            rstream.resume();
        });
    })
    .on('error', function(err){
        console.log('Error while reading file.');
        console.log(err);
    })
    .on('end', function() {
        console.log('end na!!! <3');
        fs.unlink(_ntfile, function(err){
            if(err){
                console.log('file was not deleted');
            }else{
                console.log('uploading of files is done');
            }
        });
    })
    .on('close', function() {
        // Flush whatever is left over after the last full batch.
        flushBatch(function() {});
    }));
};

// Entry point: when an archive path was given on the command line, apply the
// optional numeric overrides and decompress the archive; `exCallback` takes
// over once the decompression command has finished.
if (typeof action !== "undefined") {

    // Overrides are accepted only when they are positive integers and are
    // stored as numbers, so an empty string, "0" or "abc" keeps the default.
    var requestedLimit = Number(defLimit);
    if (requestedLimit > 0 && requestedLimit % 1 === 0) {
        limit = requestedLimit;
    }

    var requestedConcurrency = Number(defConcurrency);
    if (requestedConcurrency > 0 && requestedConcurrency % 1 === 0) {
        concurrency = requestedConcurrency;
    }

    // execFile hands the path to bzip2 as a single argument without going
    // through a shell, so spaces or shell metacharacters in the file name can
    // neither break nor extend the command.
    var child = require('child_process').execFile('bzip2', ['-dk', action], exCallback);
}

0 个答案:

没有答案