所以我在下面的代码中使用了自定义流,该流扩展了该流。从节点核心库进行转换。这段代码的目的是逐行读取文件,每行都是一个以JSON格式格式化的字符串,并进行一些处理以检查电子邮件是否为新邮件。
"use strict";
const fs = require('fs');
const split = require('split');
const request = require('request');
const parseString = require('xml2js').parseString;
const moment = require('moment');
const LimitedParallelStream = require('./limitedParallelStream');
const ApiKey = 'xxxxxxxxxxxxxxxxxxxxx';
const ListID = 'yyyyyyyyyyyyyyyyyyyy';
const sourceReadStream = fs.createReadStream(process.argv[2]);
const resultWriteStream = fs.createWriteStream('results2.txt');
let newEmailCount = 0;
let newEmailByDay = {};
sourceReadStream
.pipe(split())
.pipe(new LimitedParallelStream(3, (entrant, enc, push, done) => {
if(!entrant) return done();
const entrantObj = JSON.parse(entrant);
const { email, entrydttm } = entrantObj;
const entryDayKey = moment(entrydttm).format('YYYY-MM-DD');
console.log(`checking ${email}...`);
const options = {
method: 'GET',
url: 'http://api.com/api/Subscribers.GetSingleSubscriber',
qs:
{
ApiKey,
ListID,
EmailAddress: email
}
};
request(options, function (err, response, body) {
if (err) throw new Error(err);
parseString(body, function (parseErr, result) {
const createdDate = moment(result.date);
let newEmail = 'NO';
if (createdDate.isSameOrAfter(entrydttm)) {
newEmailCount++;
newEmail = 'YES';
if (newEmailByDay.hasOwnProperty(entryDayKey)) {
const { [entryDayKey]: count } = newEmailByDay;
newEmailByDay = { ...newEmailByDay, [entryDayKey]: count + 1 };
} else {
newEmailByDay = { ...newEmailByDay, [entryDayKey]: 1 };
}
}
push(`${email} - entrydttm: ${entrydttm}, createdttm: ${result.anyType.Date[0]}, new: ${newEmail}\n`);
done();
});
});
}))
.pipe(resultWriteStream) // { end: false }
.on('finish', () => {
console.log(`All emails were checked - total new emails: ${newEmailCount}`);
console.log(`New emails by day: ${JSON.stringify(newEmailByDay, null, 2)}`);
})
;
如果我只是在'finish'回调中控制台记录最终结果,则此方法非常好。
但是,当我尝试在连接resultWriteStream的管道上使用{end:false},然后在读取流结束时手动结束写入流时,读取流似乎在读取所有内容之前就结束了。
像这样的东西:
sourceReadStream
.on('end', () => resultWriteStream.end(`All emails were checked - total new emails: ${newEmailCount}`));
LimitedParallelStream.js如下:
"use strict";
const stream = require('stream');
class LimitedParallelStream extends stream.Transform {
constructor(concurrency, userTransform) {
super({objectMode: true});
this.concurrency = concurrency;
this.userTransform = userTransform;
this.running = 0;
this.terminateCallback = null;
this.continueCallback = null;
}
_transform(chunk, enc, done) {
this.running++;
this.userTransform(chunk, enc, this.push.bind(this), this._onComplete.bind(this));
if(this.running < this.concurrency) {
done();
} else {
this.continueCallback = done;
}
}
_flush(done) {
if(this.running > 0) {
this.terminateCallback = done;
} else {
done();
}
}
_onComplete(err) {
this.running--;
if(err) {
return this.emit('error', err);
}
const tmpCallback = this.continueCallback;
this.continueCallback = null;
tmpCallback && tmpCallback();
if(this.running === 0) {
this.terminateCallback && this.terminateCallback();
}
}
}
module.exports = LimitedParallelStream;
要读取的文件是这样的,但是有几百行:
{“电子邮件”:“ joe.doe@hotmail.com”,“ entrydttm”:“ 2019-01-16 14:09:07”} {“ email”:“ bill.gee@gmail.com”,“ entrydttm”:“ 2019-01-16 13:53:17”}
到目前为止,我还没有弄清楚在实际处理完所有行之后将最终结果写入同一文件的正确方法是什么。
任何帮助将不胜感激!