在'logs'集合上使用mapReduce来生成HTTP流

时间:2012-07-09 13:49:25

标签: mongodb mapreduce

MongoDB新手问题:

我有很多HTTP日志存储到具有以下数据结构的集合中:

{
    'client': {
        'ip_address': '1.2.3.4',
        'referrer':"http://....",
        'user_agent':'Mozilla..."
    },
    'request':{
        "stream": "stream1",
        "method": "GET",
        "fragment_id": 97,
        "date": 13482181,
    'response':{
        'status':200,
        'size': 654
    }
}

每个文档描述HTTP请求(从客户端到内容流转化器)。由于每个流被分段成小块,我想在我的集合上使用“mapReduce”,然后创建一个“通用流请求”文档,如下所示:

{
    'client_ip': '1.2.3.4',
    'user_agent': 'Mozilla',
    'streams':[
        {
        'stream':"stream1",
        'referrer':'http://...',
        'requests':[
          {
             'fragment_id':97,
             'status':200,
             'date': 13482181,
             'size': 654
             ...
          },
          {
             'fragment_id':98,
             'status':200,
             'date': 13482192,
             'size': 624
             ...
          }, [...]
         ]
        }, [...]
    ]

以下是我的尝试:

map = function(){
    emit({client_ip:this.client.ip,user_agent:this.client.user_agent},{
                stream:this.request.stream,
                referrer:this.client.referer,
                status:this.response.status,
                date:this.request.date,
                size:this.response.total_size,
                fragment_id:this.request.fragment_infos[1]
    });
}

reduce = function(key,values){
    r = {'count':0,'request':[]};
    values.forEach(function(v){
        r.count += 1;
        r.request.push(v);
    });

    return r;
}

但这是我得到的结果:

"_id" : {
    "client_ip" : "1.2.3.4",
    "user_agent" : "Mozilla\/4.0"
 },
 "value" : {
    "client_ip" : "1.2.3.4",
    "user_agent" : "Mozilla\/4.0",
    "count" : 17,
    "request" : {
        "0" : {
            "client_ip" : "1.2.3.4",
            "user_agent" : "Mozilla\/4.0",
            "count" : 2,
            "request" : {
                "0" : {
                    "stream" : "stream1.isml",
                    "referrer" : null,
                    "status" : 200,
                    "date" : 1341706566,
                    "size" : 456,
                    "fragment_id" : null,
                    "count" : 1
                },
                "1" : {
                    "stream" : "stream1.isml",
                    "referrer" : null,
                    "status" : 200,
                    "date" : 1341706566,
                    "size" : null,
                    "fragment_id" : null,
                    "count" : 1
                }
            }
        },
        "1" : {
            "client_ip" : "1.2.3.4",
            "user_agent" : "Mozilla\/4.0",
            "count" : 3,
            "request" : {
                "0" : {
                    "client_ip" : "1.2.3.4",
                    "user_agent" : "Mozilla\/4.0",
                    "count" : 2,
                    "request" : {
                        "0" : {
                            "stream" : "stream1.isml",
                            "referrer" : null,
                            "status" : 200,
                            "date" : 1341706568,
                            "size" : null,
                            "fragment_id" : null,
                            "count" : 1
.........

我哪里错了?

1 个答案:

答案 0 :(得分:1)

您将始终以包含_id和value的记录结束,这是MongoDB map / reduce的属性。有一个打开的票证来改变这种行为: https://jira.mongodb.org/browse/SERVER-2517

只要使值与您的示例对齐,您希望map函数的输出与reduce函数所需的输出形式相同。

map = function(){

  emit({client_ip:this.client.ip,user_agent:this.client.user_agent},{
    client_ip: this.client.ip,
    user_agent: this.client.user_agent,
    streams: {
      this.request.stream: {
        referrer: this.client.referer,
        requests: [
          {
            fragment_id: this.request.fragment_infos[1],
            status:this.response.status,
            date:this.request.date,
            size:this.response.total_size  
          }
        ]
      }
    }
  });
}

您需要修改reduce函数以合并此表单的多个文档。如有必要,编写一个finalize函数将流的哈希值转换为流数组,并在每个元素中包含流名称。