使用MongoDB map-reduce生成展平文档

时间:2015-08-24 01:16:26

标签: javascript mongodb mapreduce mongodb-query aggregation-framework

我从一组看起来像这样的文档开始:

{
  state: 'CA',
  year: 2014,
  accepted: true
}
{
  state: 'AL',
  year: 2012,
  accepted: false
}
{
  state: 'CA',
  year: 2013,
  accepted: false
}
...

我希望以这种格式结束新的聚合集合:

{
  _id: 'CA',
  value: {
    submittedApplications2012: 34,
    submittedApplications2013: 23,
    submittedApplications2014: 72,
    acceptedApplications2012: 12,
    acceptedApplications2013: 7,
    acceptedApplications2014: 5
  }
}
{
  _id: 'AL',
  value: {
    submittedApplications2012: 73,
    submittedApplications2013: 67,
    submittedApplications2014: 98,
    acceptedApplications2012: 45,
    acceptedApplications2013: 34,
    acceptedApplications2014: 31
  }
}

我编写了一个mapReduce,它按州(state)对文档进行分组,并遍历每个州的文档,递增相应的属性:

// Map step: for every document, emit a { year, accepted } record keyed by
// the document's state.
var map = function() {
  emit(this.state, { year: this.year, accepted: this.accepted });
};

// Reduce step: tally submitted and accepted applications per year for one key.
// Only the years 2012, 2013 and 2014 are counted; a value whose `year` is
// anything else (or missing) is ignored.
// NOTE(review): the returned object does not have the { year, accepted } shape
// of the values it consumes, so a result that is passed back into this
// function has no `year` and contributes nothing to the totals.
var reduce = function(key, values) {
  var trackedYears = [2012, 2013, 2014];
  var totals = {};

  // Create the counters in a fixed order: every submitted*, then every accepted*.
  trackedYears.forEach(function(year) {
    totals['submittedApplications' + year] = 0;
  });
  trackedYears.forEach(function(year) {
    totals['acceptedApplications' + year] = 0;
  });

  values.forEach(function(entry) {
    // indexOf uses strict equality, so only numeric years can match.
    if (trackedYears.indexOf(entry.year) === -1) {
      return;
    }
    totals['submittedApplications' + entry.year] += 1;
    if (entry.accepted) {
      totals['acceptedApplications' + entry.year] += 1;
    }
  });

  return totals;
};

// Run the map/reduce pair over test_collection, requesting inline output.
db.test_collection.mapReduce(
  map,
  reduce,
  {out: {inline: 1}}
)

不幸的是,结果不准确。对于 submitted2012、submitted2013 和 submitted2014,阿拉巴马州(AL)最终只得到 9、8 和 3。其他州的数字也偏低。集合中有 10,000 条记录,这些数字应该高得多。

我认为这种情况正在发生,因为reduce函数被多次调用(参见Reduce is called several times with the same key in mongodb map-reduce),reducedObject对象在后续传递中被覆盖。

如何防止这种情况,以便准确计算提交和接受的申请数量?

以下是一些以原始格式创建测试集合的代码:

// Rebuild test_collection with 10,000 random documents for demo purposes.
// Each document gets a random state, a random year (wrapped in NumberInt)
// and a random boolean `accepted` flag.
var i = 10000,
    states = ['AL', 'CA', 'FL', 'TN', 'OH'],
    years = [2012, 2013, 2014];

db.test_collection.drop();

while (i-- > 0) {
  // Random draws happen in the order state, year, accepted.
  var stateIndex = Math.floor(Math.random() * states.length);
  var yearIndex = Math.floor(Math.random() * years.length);
  var isAccepted = Math.random() >= 0.5;

  db.test_collection.insert({
    state: states[stateIndex],
    year: NumberInt(years[yearIndex]),
    accepted: isAccepted
  });
}

2 个答案:

答案 0 :(得分:1)

我真的不认为mapReduce是这里的正确选择。就我个人而言,我会使用聚合框架,因为它在这里处理得更快:所有操作都由原生代码完成,不需要把代码或对象转换成JavaScript。

这只需要一个简单的$group操作,并借助$cond把true/false值转换为数字:

db.test_collection.aggregate([
    { "$group": {
        "_id": { "state": "$state", "year": "$year" },
        "submitted": { "$sum": 1 },
        "accepted": { "$sum": { "$cond": [ "$accepted", 1, 0 ] } }
    }},
    { "$group": {
        "_id": "$_id.state",
        "values": {
            "$push": {
                "year": "$_id.year",
                "submitted": "$submitted",
                "accepted": "$accepted"
            }
        }
    }}
])

{
    "_id" : "CA",
    "values" : [
        {
                "year" : 2014,
                "submitted" : 691,
                "accepted" : 360
        },
        {
                "year" : 2013,
                "submitted" : 653,
                "accepted" : 332
        },
        {
                "year" : 2012,
                "submitted" : 681,
                "accepted" : 350
        }
    ]
}

产生这样的输出(为简洁起见,只显示一个州):

// Single $group stage keyed by state. Every output field is a conditional
// count: $cond yields 1 for a matching document and 0 otherwise, and $sum
// adds those up.
db.test_collection.aggregate([
    { "$group": {
        "_id": "$state",
        // Documents whose year is 2012.
        "submitted2012": { 
            "$sum": {
                "$cond": [
                    { "$eq": [ "$year", 2012 ] },
                    1,
                    0
                ]
            }
        },
        // Documents whose year is 2012 AND whose `accepted` field is true.
        "accepted2012": {
            "$sum": {
                "$cond": [
                    { "$and": [
                        { "$eq": [ "$year", 2012 ] },
                        "$accepted"
                    ]},
                    1,
                    0
                ]
            }
        },
        // Same pair of counters for 2013.
        "submitted2013": { 
            "$sum": {
                "$cond": [
                    { "$eq": [ "$year", 2013 ] },
                    1,
                    0
                ]
            }
        },
        "accepted2013": {
            "$sum": {
                "$cond": [
                    { "$and": [
                        { "$eq": [ "$year", 2013 ] },
                        "$accepted"
                    ]},
                    1,
                    0
                ]
            }
        },
        // Same pair of counters for 2014.
        "submitted2014": { 
            "$sum": {
                "$cond": [
                    { "$eq": [ "$year", 2014 ] },
                    1,
                    0
                ]
            }
        },
        "accepted2014": {
            "$sum": {
                "$cond": [
                    { "$and": [
                        { "$eq": [ "$year", 2014 ] },
                        "$accepted"
                    ]},
                    1,
                    0
                ]
            }
        }
    }}
])

或者,如果您确实必须在输出中指定所有键,请使用以下形式。在代码中生成这种结构是一件很简单的事,因为聚合管道——实际上任何原生MongoDB查询——本质上都只是"数据结构":

// Build the $group stage programmatically: grouped by state, with one
// "submitted<year>" and one "accepted<year>" conditional counter per year.
var groupStage = { "$group": { "_id": "$state" } };

[2012, 2013, 2014].forEach(function(year) {
    // Expression that is true when the document's year equals `year`.
    function isYear() {
        return { "$eq": [ "$year", year ] };
    }

    // $sum accumulator adding 1 per document for which `condition` holds.
    function countWhen(condition) {
        return { "$sum": { "$cond": [ condition, 1, 0 ] } };
    }

    var fields = groupStage["$group"];
    fields["submitted" + year] = countWhen(isYear());
    fields["accepted" + year] = countWhen({ "$and": [ isYear(), "$accepted" ] });
});

// Run a one-stage pipeline consisting of the generated groupStage.
db.test_collection.aggregate([groupStage])

事实上,它实际上就像这样微不足道:

{
    "_id" : "CA",
    "submitted2012" : 681,
    "accepted2012" : 350,
    "submitted2013" : 653,
    "accepted2013" : 332,
    "submitted2014" : 691,
    "accepted2014" : 360
}

它的输出:

// Map-reduce variant in which the mapper emits objects with the same shape
// as the reducer's result ("submitted<year>" / "accepted<year>" counters),
// so an already reduced value can be summed again with other values.
db.test_collection.mapReduce(
    // Mapper: one counter object per document, keyed by state.
    function() {
        var counts = {};
        counts["submitted" + this.year] = 1;
        counts["accepted" + this.year] = this.accepted ? 1 : 0;
        emit(this.state, counts);
    },
    // Reducer: add up every counter field found in the incoming values.
    function(key, values) {
        var totals = {};
        values.forEach(function(counts) {
            Object.keys(counts).forEach(function(field) {
                var soFar = totals.hasOwnProperty(field) ? totals[field] : 0;
                totals[field] = soFar + counts[field];
            });
        });
        return totals;
    },
    { "out": { "inline": 1 } }
)

使用mapReduce执行此操作要慢得多,但您需要注意的主要一点是:"映射器"(mapper)发出的值必须与"归约器"(reducer)自身返回的输出具有相同的结构。这是因为reducer实际上并不是一次处理完同一分组的所有文档;相反,一次"归约"的输出可以与其他已发出的值或已"归约"的值一起,再次作为"输入"被进一步归约:

{
    "_id" : "CA",
    "value" : {
            "submitted2014" : 691,
            "accepted2014" : 360,
            "submitted2013" : 653,
            "accepted2013" : 332,
            "submitted2012" : 681,
            "accepted2012" : 350
    }
}

有了这种输出:

// Map-reduce variant producing { values: [ { year, submitted, accepted }, ... ] }
// per state. The mapper emits the same shape the reducer returns, so reduced
// results can be merged again with further values.
db.test_collection.mapReduce(
    // Mapper: wrap the single-document counters in a one-element `values` array.
    function() {
        var obj = {
            "year": this.year,
            "submitted": 1,
            "accepted": (this.accepted) ? 1 : 0
        };
        emit(this.state,{ "values": [obj] });
    },
    // Reducer: sum the counters per year across all incoming `values` arrays.
    function(key,values) {
        var obj = { "values": [] };

        // Per-year running totals, keyed by year.
        var accum = {};

        values.forEach(function(value) {
            value.values.forEach(function(data) {
                if ( !accum.hasOwnProperty(data.year) )
                    accum[data.year] = {
                        submitted: 0,
                        accepted: 0
                    };
                accum[data.year]["submitted"] += data.submitted;
                accum[data.year]["accepted"] += data.accepted;
            });
        });

        Object.keys(accum).forEach(function(year) {
            obj.values.push({
                // Object keys are strings; convert back with an explicit radix.
                "year": parseInt(year, 10),
                "submitted": accum[year].submitted,
                "accepted": accum[year].accepted
            });
        });

        // Newest year first. A comparator must return a negative, zero or
        // positive number; returning a boolean gives no reliable ordering.
        obj.values.sort(function(a,b){
            return b.year - a.year;
        });

        return obj;
    },
    { "out": { "inline": 1  } }
)

对于记录,可以像这样获得类似原始聚合示例的输出:

{
    "_id" : "CA",
    "value" : {
        "values" : [
            {
                    "year" : 2014,
                    "submitted" : 691,
                    "accepted" : 360
            },
            {
                    "year" : 2013,
                    "submitted" : 653,
                    "accepted" : 332
            },
            {
                    "year" : 2012,
                    "submitted" : 681,
                    "accepted" : 350
            }
        ]
    }
}

其输出键遵循mapReduce的规则:

// Rebuild test_collection with 10,000 random documents, queued through
// ordered bulk operations that are executed in batches of 1,000.
var i = 10000,
    states = ['AL', 'CA', 'FL', 'TN', 'OH'],
    years = [2012, 2013, 2014],
    bulk = db.test_collection.initializeOrderedBulkOp();

db.test_collection.drop();

while (i-- > 0) {
  // Random draws happen in the order state, year, accepted.
  var stateIndex = Math.floor(Math.random() * states.length);
  var yearIndex = Math.floor(Math.random() * years.length);
  var isAccepted = Math.random() >= 0.5;

  bulk.insert({
    state: states[stateIndex],
    year: NumberInt(years[yearIndex]),
    accepted: isAccepted
  });

  // i counts down from 9999 to 0, so this fires once per 1,000 queued
  // inserts (the last time at i === 0); start a fresh bulk op afterwards.
  if (i % 1000 === 0) {
    bulk.execute();
    bulk = db.test_collection.initializeOrderedBulkOp();
  }
}

因此可以使用mapReduce,但聚合框架无疑是此类任务的更好选择。

使用Bulk操作时,您的生成脚本也可能更好一些:

/**
 * Task model. Declares a belongsTo relation to Category and a query scope
 * that filters tasks by their category's type.
 */
class Task extends Model
{
    /**
     * Relation to the Category this task belongs to.
     */
    public function category() {
        return $this->belongsTo('Category');
    }

    /**
     * Query scope: keep only tasks that have a category whose `type`
     * equals $cat_type.
     *
     * @param mixed $query    query object the constraint is added to
     * @param mixed $cat_type value compared against the category's `type`
     */
    public function scopeCategoryType($query, $cat_type) {
        $query->whereHas('category', function($q) use($cat_type) {
            $q->where('type', $cat_type);
        }); // closes the whereHas() call
    }
}

/**
 * Category model. Declares a hasMany relation to Task and a query scope
 * that filters categories by type.
 */
class Category extends Model
{
    /**
     * Relation to the Task records of this category.
     */
    public function tasks() {
        return $this->hasMany('Task');
    }

    /**
     * Query scope: keep only categories whose `type` equals $cat_type.
     *
     * @param mixed $query    query object the constraint is added to
     * @param mixed $cat_type value compared against the `type` column
     */
    public function scopeType($query, $cat_type) {
        $query->where('type', $cat_type);
    }
}

答案 1 :(得分:0)

您可以在Aggregation运算符的帮助下实现:

让我们创建一个以下集合:

集合:

// Seed the `flat` collection with 13 sample applications, inserted one at a
// time in the listed order.
var sampleApplications = [
    { state: "CA", year: 2014, accepted: true },
    { state: "AL", year: 2012, accepted: false },
    { state: "CA", year: 2013, accepted: false },
    { state: "AL", year: 2012, accepted: true },
    { state: "CA", year: 2011, accepted: false },
    { state: "AL", year: 2011, accepted: true },
    { state: "CA", year: 2013, accepted: false },
    { state: "AL", year: 2014, accepted: true },
    { state: "CA", year: 2014, accepted: false },
    { state: "AL", year: 2014, accepted: true },
    { state: "CA", year: 2014, accepted: false },
    { state: "AL", year: 2014, accepted: true },
    { state: "CA", year: 2014, accepted: false }
];

sampleApplications.forEach(function(application) {
    db.flat.insert(application);
});

下面的查询将得到所需的输出:

查询:

          // Summarise applications per state as one text entry per year, e.g.
          // "submittedApplications2014 : 4  ,  acceptedApplications2014 : 1".
          db.flat.aggregate([
            // Stage 1: count submitted and accepted applications per (state, year).
            { $group: {
                _id: { state: "$state", year: "$year" },
                submitted: { $sum: 1 },
                accepted: {
                  $sum: {
                    $cond: { if: { $eq: [ "$accepted", true ] }, then: 1, else: 0 }
                  }
                }
            } },
            // Stage 2: render the counters as text. $substr is used to turn the
            // numbers into strings; a length of -1 takes the rest of the string,
            // so counts of 10 or more are not cut down to their first digit.
            { $project: {
                temp: {
                  $concat: [
                    "submittedApplications",
                    { $substr: [ "$_id.year", 0, 4 ] }, " : ",
                    { $substr: [ "$submitted", 0, -1 ] }, "  ,  ",
                    {
                      $concat: [
                        "acceptedApplications",
                        { $substr: [ "$_id.year", 0, 4 ] }, " : ",
                        { $substr: [ "$accepted", 0, -1 ] }
                      ]
                    }
                  ]
                }
            } },
            // Stage 3: collect the per-year strings into one array per state.
            { $group: { _id: "$_id.state", value: { $push: "$temp" } } }
          ]).pretty();

OutPut:

{
        "_id" : "CA",
        "value" : [
                "submittedApplications2011 : 1  ,  acceptedApplications2011 : 0",
                "submittedApplications2013 : 2  ,  acceptedApplications2013 : 0",
                "submittedApplications2014 : 4  ,  acceptedApplications2014 : 1"
        ]
}
{
        "_id" : "AL",
        "value" : [
                "submittedApplications2011 : 1  ,  acceptedApplications2011 : 1",
                "submittedApplications2012 : 2  ,  acceptedApplications2012 : 1",
                "submittedApplications2014 : 3  ,  acceptedApplications2014 : 3"
        ]
}