分析node.js中的大型json日志文件

时间:2016-09-13 01:40:24

标签: javascript json node.js

我有以下JSON文件:

sensorlogs.json
{"arr":[{"UTCTime":10000001,"s1":22,"s2":32,"s3":42,"s4":12},
{"UTCTime":10000002,"s1":23,"s2":33,"s4":13},
{"UTCTime":10000003,"s1":24,"s2":34,"s3":43,"s4":14},
{"UTCTime":10000005,"s1":26,"s2":36,"s3":44,"s4":16},
{"UTCTime":10000006,"s1":27,"s2":37,"s4":17},
{"UTCTime":10000004,"s1":25,"s2":35,"s4":15},
...
{"UTCTime":12345678,"s1":57,"s2":35,"s3":77,"s4":99}
]}

传感器s1、s2、s3等以不同的频率发送数据(请注意,s3每2秒发送一次),并且日志中记录的时间戳可能是乱序的。

我怎样才能得到类似下面这样的输出:
Analyzing s1:
s = [[10000001, 22], [10000002, 23],.. [12345678,57]]
s1 had 2 missing entries
Analyzing s2:
s = [[10000001, 32], [10000002, 33],.. [12345678,35]]
s2 had 0 missing entries
Analyzing s3:
s = [[10000001, 42], [10000003, 43],.. [12345678,77]]
s3 had 0 missing entries
Analyzing s4:
s = [[10000001, 12], [10000002, 13],.. [12345678,99]]
s4 had 1 missing entries

sensorlogs.json是16 GB。

可以根据连续UTC时间戳的差异找到缺少的条目。每个传感器以已知频率传输。

由于内存限制,我无法使用多个大型数组进行分析,因此我必须对同一个JSON日志文件进行多次传递,并且只使用单个大型数组进行分析。

我现在所拥有的是 -

// Two-phase driver: pass 1 collects every key that appears in the log,
// pass 2 re-reads the file once per key to validate that sensor.
// NOTE(review): fs, lr, async, filePath, tmpObj, arrStrm, currSensor,
// processLine and processSensor are not declared in this snippet; they are
// presumably defined elsewhere (lr looks like Node's `readline` — confirm).
// NOTE(review): `result` is not used anywhere in the visible code.
var result = [];
//1. Extract all the keys from the log file
console.log("Extracting keys... \n");
var stream = fs.createReadStream(filePath);
var lineReader = lr.createInterface(
{
  input: stream
});

// Every line read from the file is handed to getKeys, which fills tmpObj.
lineReader.on('line', function (line) 
{
  getKeys(line);//extract all the keys from the JSON
});
// Phase 2 starts when the underlying file stream (not the line reader)
// reports 'end'.
// NOTE(review): if lr is `readline`, its own 'close' event may be the safer
// signal that all 'line' events have been delivered — confirm.
stream.on('end', function()
{
  //obj -> arr
  for(var key in tmpObj)
    arrStrm.push(key);

  //2. Validate individual sensors
  console.log("Validating the sensor data ...\n");

  //Synchronous execution of the sensors in the array
  // NOTE(review): the iteratee below takes only `key` and never signals
  // completion, and nothing here waits for one file pass to finish before
  // the next begins. If `async` is the caolan/async library, each() starts
  // all iteratees without waiting (eachSeries is the sequential variant),
  // so the passes would overlap rather than run one after another — confirm.
  async.each(arrStrm, function(key)
  {
    {
        // currSensor is assigned without a declaration, so it is a single
        // shared variable: each iteration overwrites it, and the 'end'
        // handler below reads whatever value it holds at that later time.
        currSensor = key;
        console.log("validating " + currSensor + "...\n");

        // stream and lineReader are the outer variables being reassigned,
        // not new per-iteration bindings.
        stream = fs.createReadStream(filePath);
        lineReader = lr.createInterface(
        {
          input: stream
        });

        lineReader.on('line', function (line) 
        {
          processLine(line);//Create the arrays for the sensors
        });
        stream.on('end', function()
        {
            processSensor(currSensor);//Process the data for the current sensor
        });
    }
  });
});

/**
 * Records every property name found in one log line as a key of `tmpObj`.
 *
 * A line containing '[' or ']' is skipped entirely (in the sample log these
 * are the first and last lines of the array wrapper). A trailing CR and a
 * trailing comma are stripped before the line is parsed as JSON. The key
 * "debug" is never recorded. Keys already present in `tmpObj` are left
 * untouched; new keys are initialised to an empty array.
 *
 * @param {string} line - One line of the log file, without the line feed.
 * @throws {SyntaxError} If the remaining text is not valid JSON.
 */
function getKeys(line)
{
    // The original assigned the indexOf results to an undeclared `pos`,
    // which leaked a global and throws in strict mode / ES modules; the
    // value was never read, so only the test itself is kept.
    if (line.indexOf('[') >= 0 || line.indexOf(']') >= 0)
        return;

    if (line[line.length - 1] === '\r') line = line.slice(0, -1); // discard CR (0x0D)
    if (line[line.length - 1] === ',') line = line.slice(0, -1); // discard ,

    // ignore empty lines
    if (line.length <= 1)
        return;

    var obj = JSON.parse(line); // parse the JSON
    for (var key in obj)
    {
        // `== null` matches both undefined and null, like the original check.
        if (key !== "debug" && tmpObj[key] == null)
            tmpObj[key] = [];
    }
}

当然这不起作用,我无法在网上找到任何解释如何实施的内容。

注意: 我可以用任何语言来开发这个工具(C / C++、C# / Java / Python),但我打算使用JavaScript,因为它可以很方便地解析JSON数组(而且我也想借此提高自己的JS水平)。如果JavaScript不是开发这类工具的最佳语言,有人可以推荐其他更合适的语言吗?

编辑:一些重要的信息要么不是很清楚,要么我之前没有提到,但看起来很重要的是要包含在问题中 -

  1. JSON日志中的数据不是实时流式传输,而是存储在硬盘中的JSON文件
  2. 存储的数据不是按时间顺序排列的,这意味着时间戳可能是乱序的。因此,每个传感器的数据在存入数组之后,需要根据时间戳进行排序
  3. 我不能为每个传感器使用单独的数组(这与在RAM中存储整个16 GB JSON相同),为了节省内存,一次只能使用一个数组。是的,我的日志中有超过4个传感器,这只是一个样本(大约20个给出一个想法)
  4. 我修改了我的JSON和预期输出

    一种解决方案可能是对JSON文件进行多次传递,一次将一个传感器数据与时间戳存储在一个数组中,然后对数组进行排序,最后分析数据是否存在损坏和间隙。这就是我在上面的代码中尝试做的事情

2 个答案:

答案 0 :(得分:3)

所以你有一个16GB的庞大传感器日志,并且它被包裹在json中。

首先,把整个16GB的文件当作一个json来处理是不现实的,原因仅仅在于开头和结尾的括号破坏了文件的规律性,成了数组中碍事的字符。我们知道文件有开头和结尾;而且如果去掉它们,您的程序就可以按块处理文件,甚至可以直接接到设备输出的流上。所以让我们假设要处理的内容是:

{"UTCTime":10000001,"s1":22,"s2":32,"s3":42,"s4":12},
{"UTCTime":10000002,"s1":23,"s2":33,"s4":13},
{"UTCTime":10000003,"s1":24,"s2":34,"s3":43,"s4":14},
...
{"UTCTime":12345678,"s1":57,"s2":35,"s3":77,"s4":99},

甚至在最后添加或检测到丢失的逗号也不会太困难。

现在每行的格式都相同,并且都可以单独解析为json。问题是:传感器是否在预期的时间输出了数据?如果我们确定它们按正确的时间和频率发送数据,只是有时可能漏写一条(情况1),那么一切都很简单。然而,如果它们的发送时间开始出现轻微的漂移(情况2),就需要某种启发式方法来恢复正确的行频率,分析所需的时间也会更长。

如果我们不需要实时处理,那么对文件进行的第一个简单验证,就是检查每隔freq行是否出现了预期的传感器数据,对吗?

在任何情况下,由于它是一个非常大的文件,因此必须尽可能逐行处理。

在以下程序中,我只考虑了案例1,并且我们可以处理连续流。

#!/usr/bin/python
"""Count missing readings per sensor in a one-record-per-line sensor log.

Only "case 1" is handled: a sensor with frequency N is expected to appear on
every Nth line of the file, and a record that lacks it on such a line counts
as one missing entry. Each line of ./sensors.json must hold one JSON object,
optionally followed by a comma.

The script runs unchanged on Python 2 and Python 3 and prints the same text
on both.
"""
import json

# Indices into each sensors[name] list, which holds [freq, err, data].
FRQ = 0
ERR = 1
DAT = 2

# Expected reporting period of each sensor, in log lines.
sensors = {
    's1': [1],
    's2': [1],
    's3': [2],
    's4': [1],
}

# Append the error counter and the data array to every sensor entry.
for entry in sensors.values():
    entry.extend([0, []])

print(sorted(sensors.items()))
S = sorted(sensors.keys())

with open('./sensors.json', "r") as stream:
    i = 0
    for line in stream:
        # Strip the line terminator (LF or CRLF) and the separating comma if
        # present. Slicing off a fixed two characters would corrupt the last
        # record, which has no trailing comma, and any CRLF-terminated line.
        record = line.strip().rstrip(',')
        if not record:
            continue  # skip blank lines
        j = json.loads(record)
        t = j["UTCTime"]
        for k in S:
            sensor = sensors[k]
            if i % sensor[FRQ] == 0:  # every Nth iteration
                v = j.get(k)
                if v is None:
                    sensor[ERR] += 1
                    print("%s has %s missing entries" % (k, sensor[ERR]))
                # A missing reading is still stored, with None as its value.
                sensor[DAT].append([t, v])
                # filling up the memory...
        i += 1

for k in S:
    print("%s %s" % (k, sensors[k][DAT]))
for k in S:
    print("%s had %s missing entries" % (k, sensors[k][ERR]))

要处理情况2,我们需要把None检查和取模检查的顺序反过来:先验证传感器是否在不该写入的时候写入了数据,然后再尝试检测时间上的偏移。

最后注意:您的程序可能会耗尽内存,因此把全部数据都保留在内存中并不是一个好主意。如果打算为每个传感器使用单独的数组做进一步处理,那么把它们写入文件可能更明智。

答案 1 :(得分:2)

再次编辑以考虑您的修改:

// Streams sensorlogs.json and reports the readings of a single sensor,
// whose name is taken from the first command-line argument.
var fs = require('fs');
var stream = fs.createReadStream('sensorlogs.json', {flags: 'r', encoding: 'utf-8'});
var buffer = '';              // text read from the file but not yet parsed
var sensor = process.argv[2]; // name of the sensor to analyse
var readings = [];            // [UTCTime, value] pairs, filled by processLine
var missingCont = 0;          // records that had no value for `sensor`
console.log('Analizying ' + sensor + ':');

// Each chunk is only accumulated and parsed here. The stream was opened with
// an encoding, so `d` is already a string.
stream.on('data', function(d) {
    buffer += d;
    processBuffer();
});

// Print the result once, after the whole file has been read. Printing from
// the 'data' handler would dump the full array again for every chunk.
stream.on('end', function() {
    console.log(readings);
    console.log(sensor + ' had ' + missingCont + ' missing entries');
});

/**
 * Extracts every complete record currently held in the global `buffer`,
 * passes each one to processLine, and leaves any incomplete trailing record
 * in `buffer` so the next chunk can complete it.
 *
 * A record is the text from a '{' to the first following '}'. This relies on
 * records being flat objects with no braces inside string values, as in the
 * sample log. A span containing '[' is treated as part of the outer array
 * wrapper rather than a record and is discarded.
 *
 * The earlier version re-ran `buffer.slice(buffer.indexOf('[{'))` on every
 * call; from the second chunk on, indexOf returned -1 and slice(-1) threw
 * away everything but the last character. It also consumed the leading '{'
 * of a record that was split across two chunks.
 */
function processBuffer() {
  var end;
  while ((end = buffer.indexOf('}')) !== -1) {
    // Search backwards from the closing brace so the wrapper's own opening
    // brace (`{"arr":[`) is skipped when a record follows it.
    var start = buffer.lastIndexOf('{', end);
    if (start !== -1) {
      var record = buffer.slice(start, end + 1);
      if (record.indexOf('[') === -1) {
        processLine(record);
      }
    }
    buffer = buffer.slice(end + 1);
  }
}

/**
 * Parses one JSON record and files the reading of the global `sensor`.
 *
 * If the record has no value for `sensor` (the property is absent or null),
 * `missingCont` is incremented. Otherwise `[UTCTime, value]` is inserted into
 * the global `readings` array so that it stays ordered by ascending UTCTime;
 * a reading whose timestamp equals existing ones is placed after them.
 *
 * @param {string} line - JSON text of a single record; "" is ignored.
 * @throws {SyntaxError} If `line` is non-empty and not valid JSON.
 */
function processLine(line) {
  if (line === "") {
    return;
  }

  var obj = JSON.parse(line);
  var value = obj[sensor];
  // Test for absence explicitly: a plain truthiness test would count a
  // legitimate reading of 0 as missing.
  if (value === undefined || value === null) {
    missingCont++;
    return;
  }

  var time = obj.UTCTime;
  var lo = 0;
  var hi = readings.length;
  // Most records arrive nearly in order, so try the append position first.
  if (hi > 0 && time >= readings[hi - 1][0]) {
    lo = hi;
  }
  // Binary search for the first entry with a strictly greater timestamp.
  while (lo < hi) {
    var mid = (lo + hi) >>> 1;
    if (time < readings[mid][0]) {
      hi = mid;
    } else {
      lo = mid + 1;
    }
  }
  readings.splice(lo, 0, [time, value]);
}

调用时必须把要分析的传感器名称作为参数传入:

node.exe scripts\processJson.js <param>

为了测试它我拿了这个样本:

{"arr":[{"UTCTime":10000001,"s1":22,"s2":32,"s3":42,"s4":12},
{"UTCTime":10000005,"s1":20,"s2":30,"s3":40,"s4":10},
{"UTCTime":10000002,"s1":23,"s2":33,"s4":13},
{"UTCTime":10000003,"s1":24,"s2":34,"s3":43,"s4":14},
{"UTCTime":12345678,"s1":57,"s2":35,"s3":77,"s4":99}
]}

输出结果为:

> node.exe scripts\processJson.js s1
Analizying s1:
[[10000001, 22], [10000002, 23], [10000003, 24], [10000005, 20], [12345678, 57]]
s1 had 0 missing entries

> node.exe scripts\processJson.js s2
Analizying s2:
[[10000001, 32], [10000002, 33], [10000003, 34], [10000005, 30], [12345678, 35]]
s2 had 0 missing entries

> node.exe scripts\processJson.js s3
Analizying s3:
[[10000001, 42], [10000003, 43], [10000005, 40], [12345678, 77]]
s3 had 1 missing entries

> node.exe scripts\processJson.js s4
Analizying s4:
[[10000001, 12], [10000002, 13], [10000003, 14], [10000005, 10], [12345678, 99]]
s4 had 0 missing entries