同时读取和匹配JavaScript中的多个巨大JSON / NDJSON文件

时间:2019-06-07 02:52:35

标签: javascript node.js json big-o

有两个文件都按ID排序,parent_data.json和child_data.json。孩子是父母的一部分。

我正在尝试将parent.id与child.id匹配。我正在尝试对每个.json文件进行一次通过。而不是像嵌套的forEach / map这样循环遍历每个孩子的事情。

假设两个.json文件都包含少量演出/数百万行,重新读取children.json将花费很长时间,需要多次传递,并且将所有内容加载到内存中也是不可能的。

每个文件可以是JSON格式,也可以是NDJSON。当前示例是NDJSON。

parent_data.json:

{"id":1, "name": "parent_1"}
{"id":2, "name": "parent_2"}
{"id":3, "name": "parent_3"}
{"id":4, "name": "parent_4"}
{"id":5, "name": "parent_5"}
{"id":6, "name": "parent_6"}
{"id":7, "name": "parent_7"}

child_data.json:

{"id":4, "name":"belongs_to_parent_4", "guid": "${unique_id}"}
{"id":4, "name":"belongs_to_parent_4", "guid": "${unique_id}"}
{"id":5, "name":"belongs_to_parent_5", "guid": "${unique_id}"}
{"id":7, "name":"belongs_to_parent_7", "guid": "${unique_id}"}
{"id":7, "name":"belongs_to_parent_7", "guid": "${unique_id}"}

预期结果:

[
    {
        "id": 4,
        "name": "parent_4",
        "children": [
            {
                "id": 4,
                "name": "belongs_to_parent_4",
                "guid": "${unique_id}"
            },
            {
                "id": 4,
                "name": "belongs_to_parent_4",
                "guid": "${unique_id}"
            }
        ]
    },
    {
        "id": 5,
        "name": "parent_5",
        "children": [
            {
                "id": 5,
                "name": "belongs_to_parent_5",
                "guid": "${unique_id}"
            }
        ]
    },
    {
        "id": 7,
        "name": "parent_7",
        "children": [
            {
                "id": 7,
                "name": "belongs_to_parent_7",
                "guid": "${unique_id}"
            }
        ]
    }
]

我找不到一种合适的方法来遍历任何一条流,而不会在最终结果中丢失一些孩子。

此代码错过了每个父母的两个第一个孩子:

const fs = require('fs');
const es = require('event-stream');

const main = () => {
    let currentParent = null;
    let currentChild = null;
    let prevChild = null;

    let parentLines = 0;
    let childLines = 0;

    let tmpChildren = [];
    const parentStream = fs
        .createReadStream('parent_data.json') // Parent Stream
        .pipe(es.split('\n')) // Delimit by \n
        .pipe(
            es.mapSync(line => { // Current line, without the delimiter \n
                parentStream.pause(); // Pause stream until done processing this line
                ++parentLines; // Debug
                currentParent = JSON.parse(line); // Now it's valid JSON object
                console.log('currentParent', currentParent.id); // Debug
            }),
        )
        .on('error', e => {
            console.error(e);
        })
        .on('close', () => {
            console.log('close reading parent stream');
        })
        .on('end', () => {
            console.log('end reading parent stream');
        });

    const childStream = fs
        .createReadStream('child_data.json') // Child Stream
        .pipe(es.split('\n')) // Split by delimiter
        .pipe(
            es.mapSync(line => { // Current child line, without delimiter
                ++childLines; // Debug
                childStream.pause(); // Pause child stream
                currentChild = JSON.parse(line); // Valid JSON child now
                if (prevChild && (currentParent.id === prevChild.id)) { // Check prevChild and currentParent
                    tmpChildren.push(prevChild);
                    prevChild = null;
                }
                if (currentChild && (currentParent.id == currentChild.id)) { // Check currentChild and currentParent
                    console.log('child', currentChild.id);
                    tmpChildren.push(currentChild);
                    // childStream.resume(); // Having this here will cause the stream to stop processing entirely
                } else {
                    // We're here because currentParent does not match currentChild, move to next parent
                    prevChild = currentChild;
                    if (tmpChildren.length > 0) {
                        currentParent['children'] = tmpChildren;
                        tmpChildren = [];
                    }
                    parentStream.resume();
                }
                console.log('currentChild', currentChild.id);
                childStream.resume(); // Having this here will cause the stream to skip children on new parent
            }),
        )
        .on('error', e => {
            console.error(e);
        })
        .on('close', () => {
            console.log('close reading child stream');
        })
        .on('end', () => {
            console.log('end reading child stream');
        });
};

main();

TLDR:在child.id上匹配parent.id,如果不匹配,则保存child。循环到下一个父对象,检查parent.id上的prevChild.id和/或child.id。如果匹配,则将prevChild / child附加到parent [children],然后迭代child。如果不匹配,则迭代到下一个父对象...等

Maybe a graphic will help

0 个答案:

没有答案