我正在学习如何在Node.js中使用流(stream),并且已经理解了request这个npm模块文档中给出的示例:
// Start the HTTP request first, then stream its response body into ./filename.json.
const download = request(url);
download.pipe(fs.createWriteStream('./filename.json'));
但我的问题有两部分。
案例1:
/**
 * Requests `url`, gathers the text of every <loc> element in the response and
 * stores the list in ./sitemaps.json as a JSON array.
 *
 * @returns whatever `requestAsync(url).then(...)` yields; the chained handler
 *   itself returns nothing.
 */
function fetchSitemaps() {
  // Pulls the text content out of each <loc> node of the given document.
  const collectLocs = (body) => {
    const $ = cheerio.load(body);
    const found = [];
    $("loc").each((_index, node) => {
      found.push($(node).text());
    });
    return found;
  };

  return requestAsync(url).then((body) => {
    const serialized = JSON.stringify(collectLocs(body));
    fs.writeFileSync('./sitemaps.json', serialized);
  });
}
我想把上面代码中的writeFileSync改成createWriteStream,但这样的话,该如何不断地把数据追加到一个JSON格式的数组中呢?
案例2:
/**
 * Collects the <loc> entries found two levels below `url` and stores them in
 * ./lyrics.json as a JSON array.
 *
 * For every entry of ./sitemaps.json the function requests `url`, reads the
 * <loc> values of the response, requests each of those values in turn and
 * gathers the <loc> values of the second-level responses. The gathered list
 * is written once per entry, after all of its second-level requests finished,
 * in the order the first-level <loc> values were listed.
 *
 * @returns {Promise<string[][]>} settles when every request and write is done;
 *   resolves with one array of collected URLs per ./sitemaps.json entry and
 *   rejects if any request or write fails.
 */
function fetchLyricUrls() {
  const sitemaps = JSON.parse(fs.readFileSync('./sitemaps.json'));

  // Reads the text content of every <loc> element in a document.
  const extractLocs = (body) => {
    const $ = cheerio.load(body);
    const locs = [];
    $("loc").each((i, e) => {
      locs.push($(e).text());
    });
    return locs;
  };

  // NOTE(review): as in the original, each iteration requests the outer `url`
  // (not the sitemap entry itself) and writes to the same ./lyrics.json, so the
  // last iteration to finish decides the file content - confirm the intent.
  // NOTE(review): the original derived a per-sitemap `fileName` via `url_pat`
  // but never used it; that dead computation has been dropped.
  const jobs = sitemaps.map(() =>
    requestAsync(url)
      .then(extractLocs)
      // Returning Promise.all keeps the nested requests inside the chain, so
      // failures propagate to the caller instead of floating.
      .then((urls) => Promise.all(urls.map((u) => requestAsync(u).then(extractLocs))))
      .then((perPage) => {
        const allUrls = perPage.reduce((all, locs) => all.concat(locs), []);
        // One write with the complete list instead of one write per response.
        fs.writeFileSync('./lyrics.json', JSON.stringify(allUrls));
        return allUrls;
      }));

  return Promise.all(jobs);
}
问题的第一部分与案例1相同:使用writeStream把数据追加到JSON中。但这一次,我还想解析HTML数据并从中提取一些文本,然后通过流发送这些文本,而不是把HTML数据作为一个整体发送。
答案 0 :(得分:0)
那么我们把答案分成两部分。
首先,我会尽量让数据保持流的形式,而不是把它累积起来。也就是说,我不会先加载整个站点地图再去解析它,而是使用类似xml-nodes的工具,让各个节点构成一个单独的流。然后由我写的模块scramjet来完成转换:
// Streams the <loc> entries of a sitemap into ./sitemaps.json as a JSON array,
// entry by entry, instead of collecting the whole list first.
const fs = require('fs');
const request = require('request');
const xmlNodes = require('xml-nodes');
const cheerio = require('cheerio');
const scramjet = require('scramjet');

const writable = fs.createWriteStream('./sitemaps.json');

// The array brackets are written by hand because the entries arrive one by one.
writable.write('[');
let written = 0;

const entries = request('http://example.com/sitemap.xml')
    // this fetches your sitemap
    .pipe(xmlNodes('loc'))
    // this extracts your "loc" nodes
    .pipe(new scramjet.DataStream())
    // this creates a mappable stream
    .map((nodeString) => cheerio('loc', nodeString).text())
    // this extracts the text as in your question
    .map((url) => (written++ ? ',' : '') + JSON.stringify(url));
    // this makes sure that strings are nicely escaped
    // and prepends them with a comma on every node, but first one

entries
    // Close the array only when the LAST stage has ended: the request's own
    // 'end' can fire while entries are still in flight further down the
    // pipeline, which would end sitemaps.json before they are written.
    .on('end', () => writable.end(']'))
    // {end: false} keeps the file open so the closing bracket can be appended
    .pipe(writable, {end: false});
    // and this will push all your entries to the writable stream
其次,对于案例2,你也需要做类似的处理。不过,如果案例1只是紧接在它之前的一个中间步骤,那么我建议把文件存成JSON Lines格式(每行一个JSON值),而不是一个数组。这样进行流式处理会更容易。