修改数据后的nodejs管道流

时间:2015-11-09 06:30:03

标签: json node.js stream

我正在学习如何使用nodejs进行流式处理,并且能够理解request npm模块中给出的示例:

// Example from the request module: pipe the HTTP response for `url` directly into ./filename.json.
request(url).pipe(fs.createWriteStream('./filename.json'))

但我的问题有两部分。

案例1:

/**
 * Requests `url`, gathers the text of every <loc> element in the response,
 * and stores the resulting list as a JSON array in ./sitemaps.json.
 *
 * @returns {Promise} the promise produced by requestAsync(url).then(...);
 *     its handler returns nothing, and the file is written synchronously
 *     inside that handler.
 */
function fetchSitemaps() {
    const saveLocations = (body) => {
        const $ = cheerio.load(body);
        const locations = [];
        for (const node of $("loc").toArray()) {
            locations.push($(node).text());
        }
        fs.writeFileSync('./sitemaps.json', JSON.stringify(locations));
    };

    return requestAsync(url).then(saveLocations);
}

我想将上述内容从writeFileSync转换为createWriteStream,但如何继续将数据附加到JSON格式的数组中?

案例2:

/**
 * Reads the sitemap URLs stored in ./sitemaps.json, requests each of them,
 * follows every <loc> entry listed there, and writes the <loc> URLs found in
 * those nested documents to one JSON file per sitemap.
 *
 * The output file is named after the first capture group of `url_pat` when
 * the sitemap URL matches it, and after the sitemap's index otherwise.
 *
 * Fixes over the previous version:
 *   - each iteration now requests its own `sitemap` instead of the unrelated
 *     free variable `url`;
 *   - `fileName` is actually used, so sitemaps no longer overwrite one shared
 *     ./lyrics.json;
 *   - the pattern is executed once per sitemap;
 *   - nested requests are awaited with Promise.all and the chain is returned,
 *     so failures propagate and each file is written exactly once.
 *
 * @returns {Promise<Array<string[]>>} resolves, after every file has been
 *     written, with the collected URLs grouped per sitemap.
 */
function fetchLyricUrls() {
    const sitemaps = JSON.parse(fs.readFileSync('./sitemaps.json'));

    return Promise.all(sitemaps.map((sitemap, i) => {
        // If url_pat carries the g or y flag, exec() resumes from lastIndex;
        // reset it so every sitemap is matched from the start.
        url_pat.lastIndex = 0;
        const match = url_pat.exec(sitemap);
        const fileName = (match ? match[1] : i) + '.json';

        return requestAsync(sitemap).then(data => {
            const $ = cheerio.load(data);
            const urls = [];
            $("loc").each((j, e) => {
                urls.push($(e).text());
            });

            // One request per nested URL; each resolves to the <loc> texts it contains.
            return Promise.all(urls.map(u => requestAsync(u).then(sm => {
                const $sm = cheerio.load(sm);
                const found = [];
                $sm("loc").each((j, e) => {
                    found.push($sm(e).text());
                });
                return found;
            })));
        }).then(groups => {
            // Flatten in document order so the output does not depend on response timing.
            const allUrls = groups.reduce((acc, group) => acc.concat(group), []);
            fs.writeFileSync(fileName, JSON.stringify(allUrls));
            return allUrls;
        });
    }));
}

问题的第一部分与案例1相同:使用writeStream追加JSON数据。但这一次,我想先解析html数据并提取其中的部分文本,然后只把这些文本通过流发送出去,而不是把html数据作为一个整体发送。

1 个答案:

答案 0 :(得分:0)

那么,让我们把答案分成两部分来说明。

案例1

首先,我会尽量让数据保持为流的形式,避免把它全部累积在内存中。也就是说,我不会先加载整个站点地图再去解析,而是使用类似xml-nodes的工具,把各个节点作为一个单独的流输出。然后用我的模块scramjet对这些节点进行转换:

    const fs = require('fs');
    const request = require('request');
    const xmlNodes = require('xml-nodes');
    const cheerio = require('cheerio');
    const scramjet = require('scramjet');

    // The fs function is createWriteStream; fs.createWritableStream does not exist.
    const writable = fs.createWriteStream('./sitemaps.json');

    // Open the JSON array by hand; entries are appended as they stream in.
    writable.write('[');

    // Counts the entries emitted so far; only used to decide whether a
    // separating comma is needed.
    let written = 0;
    request('http://example.com/sitemap.xml')
            // this fetches your sitemap
        .pipe(xmlNodes('loc'))
            // this extracts your "loc" nodes
        .pipe(new scramjet.DataStream())
            // this creates a mappable stream
        .map((nodeString) => cheerio('loc', nodeString).text())
            // this extracts the text as in your question
        .map((url) => (written++ ? ',' : '') + JSON.stringify(url))
            // this makes sure that strings are nicely escaped
            // and prepends them with a comma on every node, but first one
        .on('end', () => writable.end(']'))
            // close the array when the mapped stream ends, not when the HTTP
            // request ends: the request can finish while entries are still
            // buffered in the transforms, which would close the file too early
        .pipe(writable, {end: false});
            // and this will push all your entries to the writable stream

案例2

在这里你需要做类似的事情。不过,如果案例1只是一个中间步骤,那么我建议把文件存储为JSON Lines格式(每行一个JSON值),而不是一个数组,这样以流的方式传输会更容易。