我试图读取一个大文件,做一些计算,然后写入一个更大的文件。为了防止过多的内存消耗,我正在使用流。我面临的问题是写入流没有触发“漏极”事件,这表示写入已刷新到磁盘。为了防止“背压”,我在等待再次开始写入缓冲区之前触发排水事件。在调试时我发现在.write()调用返回false并且执行了行fvfileStream.once('drain',test)后,程序就会停止并且什么都不做。
以下是代码:
var fs = require('fs');
//a test function I created to see if the callback is called after drain.
var test = function(){
console.log("Done Draining");
}
fs.readFile('/another/file/to/be/read', {
encoding: "utf8"
}, function(err, data) {
if (err) throw err;
//Make an array containing tags.
var tags = data.split('\n');
//create a write stream.
var fvfileStream = fs.createWriteStream('/path/TagFeatureVectors.csv');
//read in the question posts
var qfileStream = fs.createReadStream('/Big/file/QuestionsWithTags.csv', {
encoding: "utf8"
});
var partialRow = null;
var writable = true;
var count = 0;
var doRead = function() {
var qData = qfileStream.read();
var questions = qData.split('\n');
if (partialRow != null) {
questions[0] = partialRow + questions[0];
partialRow = null;
}
var lastRow = questions[questions.length - 1];
if (lastRow.charAt(lastRow.length - 1) != '\n') {
partialRow = lastRow;
}
questions.forEach(function(row, index, array) {
count++;
var fields = row.split(',');
console.log("Processing question number: " + count + " id: " + fields[0]);
var tagString = fields[1];
var regex = new RegExp(/<([^>]+)>/g);
tags.forEach(function(tag, index, array) {
var found = false;
var questionTags;
while ((questionTags = regex.exec(tagString)) != null) {
var currentTag = questionTags[1]
if (currentTag === tag) {
found = true;
break;
}
};
//This is where the writestream is written to
if (found) {
writable = fvfileStream.write("1,", "utf8");
}else {
writable = fvfileStream.write("0,","utf8");
}
});
});
fvfileStream.write("\n");
}
qfileStream.on('readable', function() {
if (writable) {
doRead();
} else {
//Waiting for drain event.
fvfileStream.once('drain', test);
}
});
qfileStream.on('end', function() {
fvfileStream.end();
});
});
根据@loganfsmyth提供的建议,我实现了转换流,但仍然遇到了同样的问题。这是我更新的代码:
var fs = require('fs');
var stream = require('stream');
var util = require('util');
var Transform = stream.Transform;
function FVCreator(options) {
// allow use without new
if (!(this instanceof FVCreator)) {
return new FVCreator(options);
}
// init Transform
Transform.call(this, options);
}
util.inherits(FVCreator, Transform);
var partialRow = null;
var count = 0;
var tags;
FVCreator.prototype._transform = function(chunk, enc, cb) {
var that = this;
var questions = chunk.toString().split('\n');
if (partialRow != null) {
questions[0] = partialRow + questions[0];
partialRow = null;
}
var lastRow = questions[questions.length - 1];
if (lastRow.charAt(lastRow.length - 1) != '\n') {
partialRow = lastRow;
questions.splice(questions.length - 1, 1);
}
questions.forEach(function(row, index, array) {
count++;
var fields = row.split(',');
console.log("Processing question number: " + count + " id: " + fields[0]);
var tagString = fields[1];
var regex = new RegExp(/<([^>]+)>/g);
tags.forEach(function(tag, index, array) {
var found = false;
var questionTags;
while ((questionTags = regex.exec(tagString)) != null) {
var currentTag = questionTags[1]
if (currentTag === tag) {
found = true;
break;
}
};
if (found) {
that.push("1,", "utf8");
} else {
that.push("0,", "utf8");
}
});
});
this.push("\n", "utf8");
cb();
};
fs.readFile('/another/file/to/be/read', {
encoding: "utf8"
}, function(err, data) {
if (err) throw err;
//Make an array containing tags.
tags = data.split('\n');
//write to a file.
var fvfileStream = fs.createWriteStream('/path/TagFeatureVectors.csv');
//read in the question posts
var qfileStream = fs.createReadStream('/large/file/to/be/read', {
encoding: "utf8"
});
var fvc = new FVCreator();
qfileStream.pipe(fvc).pipe(fvfileStream);
});
我在OSX Yosemite上运行它。