Question

我正在使用 Node.js 从一个非常大的 JSON 文件（1GB 以上）中读取 JSON 对象。JSON 文件的格式为 [{field1: x, field2: x, field3: x}, {...}, ..., {...}]，对象之间没有换行分隔。为了避免内存问题，我使用 fs.createReadStream 并按顺序处理每个数据块。这种方法可行，我能得到有效的 JSON 对象，但读取器只读取了一个数据块就停止了。为什么它不继续读取文件的其余部分？

我的解决方案受到了这个问题中接受的答案的启发：Parse large JSON file in Nodejs

以下是代码：

// Get the JSON file
var fs = require('fs');
// Open Results.json for reading as a UTF-8 text stream.
var stream = fs.createReadStream('Results.json', {encoding: 'utf-8', flags: 'r'});
var buf = '';   // text received so far that process() has not consumed yet
var count = 0;  // running total, printed when the stream ends

// Appends the incoming chunk to the pending text and runs the extractor.
function onChunk(chunk) {
    console.log("Stream on data!"); // observed: ONLY EXECUTED ONCE
    buf = buf + chunk.toString();
    process();
}

// Reports a stream failure.
function onError(err) {
    // observed: NEVER EXECUTED
    console.log(err);
}

// Prints the final tally once the whole file has been delivered.
function onEnd() {
    // observed: NEVER EXECUTED
    console.log("Count: " + count);
}

stream.on('data', onChunk);
stream.on('error', onError);
stream.on('end', onEnd);

/**
 * Scans the module-level `buf` string for `{ ... }` spans, passes each one to
 * processObjectString(), and removes it from `buf`.
 *
 * Object boundaries are found with plain indexOf('{') / indexOf('}'), so the
 * first '}' after a '{' is always treated as the end of the object.
 *
 * NOTE: the loop has no exit for an incomplete object. When `buf` starts with
 * '{' but contains no '}' yet (posStart is 0, posEnd is -1), neither branch
 * below changes `buf`, the positions are recomputed to the same values, and
 * the `while` condition stays true, so this call never returns. The same lack
 * of progress occurs when a '}' sits before the next '{'.
 */
function process() {
    var posStart = buf.indexOf('{');
    var posEnd = buf.indexOf('}');

    while (posStart >= 0 || posEnd >= 0) { // keep going until the start or end of the json object in the string
        // IF the start bracket is before the end, skip to the start
        // (posStart/posEnd are NOT refreshed after this slice, so the check
        // below still sees the pre-slice positions during this iteration)
        if((posStart < posEnd || posEnd < 0) && posStart >= 0){ 
            buf = buf.slice(posStart);
        }
        if(posStart == 0 && posEnd >= 0){ // IF the end bracket is next
            processObjectString(buf.slice(0, posEnd+1)); // Process the complete object string
            buf = buf.slice(posEnd+1); // Remove the processed string from the buffer
        }
        // Update the positions
        posStart = buf.indexOf('{');
        posEnd = buf.indexOf('}');
    }
}

/**
 * Handles one serialized JSON object: bumps the module-level `count`,
 * parses the text and logs the parsed object's `id` property.
 *
 * The counter is advanced before parsing, so it also counts inputs for which
 * JSON.parse throws.
 *
 * @param {string} objectString - JSON text of a single object.
 */
function processObjectString(objectString) {
    count += 1;
    var parsed = JSON.parse(objectString);
    var id = parsed.id;
    console.log(id);
}

修改：在修复了导致 while 循环无限执行的错误之后，以下是可以遍历 JSON 文件中所有对象的可行解决方案。它可能不够优雅，但至少可以工作（供遇到类似问题的人参考）。

// Get the JSON file
var fs = require('fs');
// Open Results.json for reading as a UTF-8 text stream.
var stream = fs.createReadStream('Results.json', {flags: 'r', encoding: 'utf-8'});
var buf = ''; // text received so far that process() has not consumed yet
var count = 0; // running total, printed when the stream ends

stream.on('data', function(chunk) {
    // Because `encoding` is set, chunks arrive as strings; no conversion needed.
    buf += chunk;
    // process() works on the module-level buf and declares no parameters.
    process();
});
stream.on('error', function(err) {
    // Report failures on stderr so they do not mix with the normal output.
    console.error(err);
});
stream.on('end', function() {
    console.log("Count: " + count);
});

/**
 * Extracts every complete top-level `{ ... }` object currently held in the
 * module-level `buf` string and passes each one, in order, to
 * processObjectString().
 *
 * The text is scanned once, tracking brace depth and whether the scanner is
 * inside a double-quoted string (honouring backslash escapes). This means:
 *   - nested objects are delivered as part of their enclosing object;
 *   - braces inside string values do not affect object boundaries;
 *   - a stray '}' between objects is skipped instead of stalling the scan.
 *
 * Afterwards `buf` holds only the trailing incomplete object (starting at its
 * opening '{'), or the empty string when nothing is pending. Text between
 * objects (array brackets, commas, whitespace) is discarded. Quotes outside
 * of any object are not interpreted.
 *
 * The function always returns after one pass, so the caller can append the
 * next chunk to `buf` and call it again.
 */
function process() {
    var text = buf;        // snapshot; buf is reassigned once, at the end
    var objectStart = -1;  // index of the '{' opening the current object, -1 between objects
    var depth = 0;         // number of unclosed '{' inside the current object
    var inString = false;  // true while inside a double-quoted string
    var escaped = false;   // true when the previous string character was a backslash

    for (var i = 0; i < text.length; i++) {
        var ch = text[i];

        if (inString) {
            if (escaped) {
                escaped = false; // this character is escaped; take it literally
            } else if (ch === '\\') {
                escaped = true;
            } else if (ch === '"') {
                inString = false;
            }
            continue;
        }

        if (depth === 0) {
            // Between objects only an opening brace is meaningful.
            if (ch === '{') {
                objectStart = i;
                depth = 1;
            }
            continue;
        }

        if (ch === '"') {
            inString = true;
        } else if (ch === '{') {
            depth++;
        } else if (ch === '}') {
            depth--;
            if (depth === 0) {
                processObjectString(text.slice(objectStart, i + 1));
                objectStart = -1;
            }
        }
    }

    // Keep only the unfinished object (if any) for the next chunk.
    buf = objectStart >= 0 ? text.slice(objectStart) : '';
}

/**
 * Consumes the JSON text of a single object.
 *
 * Increments the module-level `count` first (so inputs that fail to parse are
 * counted too), then parses the text and logs the `id` of the result.
 *
 * @param {string} objectString - serialized JSON object.
 */
function processObjectString(objectString) {
    count = count + 1;
    var record = JSON.parse(objectString);
    console.log(record.id);
}

Answer 1

一些理论

Node.js 是异步的，但实际上是单线程的。如果进程在处理已收到的数据时卡住，它将永远收不到第二个数据块，因为发送方必须等待被卡住的线程释放之后才能执行任何操作。

含义

如果 'data' 回调 function(chunk) 中的 process(); 这一行陷入了无限循环，那么您将永远收不到第二个数据块，因此看起来就像是发送方在偷懒。

今后的建议：始终先尝试隔离问题，以确保您排查的是正确的位置。

P.S. 在处理文本时，确实很容易让自己陷入无限循环，我能体会你的痛苦。

Nodejs createReadStream只读取大型JSON文件的一个数据块

1 个答案:

一些理论

含义