我有一段代码,它使用 fs.createReadStream 逐行处理大型 XML 文件并将每行结果保存到数组中。当数组长度达到限制(例如 20)时,我会暂停读取流,然后用 async.queue 处理该数组。
// code
// Module dependencies (duplicate `require('util')` removed).
var fs = require('fs');
var util = require('util');
var stream = require('stream');
var es = require("event-stream");
var async = require('async');
var request = require("request");
var neo4j = require('node-neo4j');
var exec = require('child_process').exec;
var dbPediaExtractor = require('./dbPediaExtractor');
var dbpExt = new dbPediaExtractor();

// Neo4j transactional HTTP endpoint configuration.
var host = 'localhost';
var port = 7474;
var username = "neo4j";
var password = "12345";
// Buffer.from() replaces the deprecated/unsafe `new Buffer(string)`.
var auth = "Basic " + Buffer.from(username + ":" + password).toString("base64");
var httpUrlForTransaction = 'http://' + host + ':' + port + '/db/data/transaction/commit';

// Batch size and queue concurrency; may be overridden by CLI args below.
var limit = 200;
var concurrency = 200;

// CLI: node script <file.bz2> [limit] [concurrency]
var action = process.argv[2];
var defLimit = process.argv[3];
var defConcurrency = process.argv[4];
// Bug fix: the original called action.replace() unconditionally, which
// threw a TypeError before the `typeof action` guard at the bottom of
// the file could take effect when no argument was given.
var _ntfile = (typeof action !== 'undefined') ? action.replace('.bz2', '') : undefined;

// Shared accumulator for Cypher query fragments built by queue workers.
var queryArray = [];
/**
 * Sends a Cypher statement string to the Neo4j transactional HTTP
 * endpoint and invokes the callback with the response body.
 *
 * @param {string} statements - raw Cypher statement text.
 * @param {function(Error, Object)} callback - node-style (err, body).
 */
function runCypherQuery(statements, callback) {
  // Bug fix: the original re-declared `var statements`, shadowing the
  // parameter of the same name; use a distinct name for the envelope.
  var payload = {
    "statements": [{
      "statement": statements
    }]
  };
  request.post({
    uri: httpUrlForTransaction,
    json: payload,
    headers: {
      "Authorization": auth
    }
  },
  function (err, res, body) {
    callback(err, body);
  });
}
/**
 * Runs every entry of `item` through the dbPedia extractor using an
 * async.queue, pushing each produced query string onto the shared
 * `queryArray`, then invokes `cb()` once the queue drains.
 *
 * Extractor errors are intentionally skipped (best-effort, matching
 * the original behavior): the task completes without pushing.
 *
 * NOTE(review): assigning `q.drain = fn` assumes async v1/v2; in
 * async v3 drain is a method (`q.drain(fn)`) — confirm the installed
 * version.
 *
 * @param {Array} item - resource URLs to process.
 * @param {function(Error=)} cb - called once when processing is done.
 */
var carrier = function (item, cb) {
  // Bug fix: with an empty batch the queue never fires `drain`, so
  // the original never invoked `cb` (a silent hang when the final
  // partial batch was empty). Short-circuit instead.
  if (!item || item.length === 0) {
    return cb();
  }
  var q = async.queue(function (task, callback) {
    dbpExt.run(task, function (err, querString) {
      if (!err) {
        queryArray.push(querString);
      }
      callback();
    });
  }, concurrency);
  q.drain = function () {
    var message = 'finished processing ' + item.length + " number of queries ";
    message = message + "with concurrency of " + concurrency + " processe(s)";
    console.log(message);
    cb();
  };
  // Indexed loop instead of for..in (which leaked an implicit global
  // `i` and walks enumerable keys rather than array order).
  for (var i = 0; i < item.length; i++) {
    q.push(item[i]);
  }
};
/**
 * Callback for the bzip2 decompression child process. On success it
 * streams the decompressed N-Triples file line by line, extracts the
 * first `<...>` URL of each line, buffers up to `limit` of them, and
 * flushes each full batch through carrier() + runCypherQuery(). The
 * final partial batch is flushed on stream 'close', after which the
 * decompressed file is deleted on 'end'.
 *
 * @param {Error|null} error - exec error, if any.
 * @param {string} stdout - child stdout (unused).
 * @param {string} stderr - child stderr (unused).
 */
var exCallback = function (error, stdout, stderr) {
  if (error !== null) {
    console.log("error: \n", error);
    return;
  }
  var itemJson = [];
  var rstream = fs.createReadStream(_ntfile)
    .pipe(es.split())
    .pipe(es.mapSync(function (line) {
      line = line.toString();
      var urlArrays = line.split(" ");
      var res_urls = [];
      for (var i = 0; i < urlArrays.length; i++) {
        var urlSearch = urlArrays[i].match(/<(.*?)>/);
        if (urlSearch !== null) { res_urls.push(urlSearch[1]); }
      }
      // Skip lines with no <...> URL; the original pushed `undefined`
      // into the batch for such lines (e.g. the trailing empty line).
      if (res_urls.length === 0) { return; }
      // Bug fix: the original only buffered the URL when the batch
      // still had room, so the line that *triggered* a flush was
      // silently dropped. Buffer first, then flush when full.
      itemJson.push(res_urls[0]);
      if (itemJson.length >= limit) {
        rstream.pause();
        console.log(itemJson);
        carrier(itemJson, function (err) {
          if (err) {
            console.log("reply status: ", err);
          }
          var querString = queryArray.join(" ");
          runCypherQuery(querString, function (err, resp) {
            if (err) {
              // Bug fix: the original called `callback(err)` here, but
              // no `callback` exists in this scope — it would have
              // thrown a ReferenceError and stalled the stream.
              console.log("query error: ", err);
            } else {
              console.log(querString);
            }
            queryArray = [];
            itemJson = [];
            rstream.resume();
          });
        });
      }
    }))
    .on('error', function (err) {
      console.log('Error while reading file.');
      console.log(err);
    })
    .on('end', function () {
      console.log('end na!!! <3');
      fs.unlink(_ntfile, function (err) {
        if (err) {
          console.log('file was not deleted');
        } else {
          console.log('uploading of files is done');
        }
      });
    })
    .on('close', function () {
      // Flush whatever remains in the final, partial batch.
      console.log(itemJson);
      carrier(itemJson, function (err) {
        if (err) {
          console.log("reply status: ", err);
        }
        var querString = queryArray.join(" ");
        runCypherQuery(querString, function (err, resp) {
          if (err) {
            // Same undefined-`callback` fix as in the flush above.
            console.log("query error: ", err);
          } else {
            console.log(querString);
          }
          queryArray = [];
        });
        itemJson = [];
      });
    });
};
// Entry point: only run when a .bz2 file argument was supplied.
if (typeof action !== "undefined") {
  // Bug fix: process.argv values are strings; the original assigned
  // them to `limit`/`concurrency` unparsed, so async.queue received a
  // string concurrency and the batch limit compared via coercion.
  if (typeof defLimit !== 'undefined' && !isNaN(defLimit)) {
    limit = parseInt(defLimit, 10);
  }
  if (typeof defConcurrency !== 'undefined' && !isNaN(defConcurrency)) {
    concurrency = parseInt(defConcurrency, 10);
  }
  // SECURITY: `action` is interpolated into a shell command; a crafted
  // filename could inject commands. Prefer execFile('bzip2', ['-dk', action]).
  var cli_code = "bzip2 -dk " + action;
  var child = exec(cli_code, exCallback);
}