聚合计算每个标签的数量,其中有两种类型的标签

时间:2014-05-26 05:27:18

标签: mongodb mapreduce aggregation-framework

我有一个包含可以简化的元素的集合:

{
  t1 : [1, 3, 6],
  t2 : [8, 9]
}

其中 t1 和 t2 各自可以包含 1 到 10 个互不重复的正整数。我需要统计集合中所有文档里,t1 和 t2 中每个数字各出现了多少次。

例如,如果我的集合包含 3 个文档:

{
  t1 : [1, 3, 6],
  t2 : [8, 9]
}, {
  t1 : [1, 2],
  t2 : [8]
}, {
  t1 : [6],
  t2 : [8, 1]
}

我应该得到类似下面这样的结果:

t1 : {
   1 : 2,   // there are 2 elements of 1 in t1
   3 : 1,   // there is 1 element of 3 in t1
   6 : 2,
   2 : 1
}

t2 : {
  8 : 3,    // there are 3 elements of 8 in t2
  9 : 1,
  1 : 1
}

我目前正在做的是这样的事情:

// Client-side accumulators for the per-value counts of t1 and t2.
var t1 = {}, t2 = {};
db.coll.find().forEach(function(e){
   // Iterate through each element of the document's t1 and t2 arrays and
   // increment (or create) the matching counter in t1 and t2.
})

虽然这种方法本身没有什么问题,但我认为使用聚合框架会有更好的做法。是否可以只用一次聚合调用来完成?

P.S。我在示例中显示的输出只是一个示例。任何可以提供我需要的信息的输出都是合适的。

2 个答案:

答案 0 :(得分:2)

使用聚合框架的单条语句形式:

// Count how often each value occurs in t1 and in t2 using a single pipeline.
db.tags.aggregate([
    // Keep both arrays and attach a literal list of the two field names so
    // each document can be fanned out once per field.
    { "$project": {
        "_id": 0,
        "t1": 1,
        "t2": 1,
        "type": { "$literal": ["t1","t2"] }
    }},
    // One copy of the document per field name ("t1", then "t2").
    { "$unwind": "$type" },
    // Select the array that corresponds to the current field name.
    { "$project": {
        "type": 1,
        "value": {
            "$cond": [
                { "$eq": [ "$type", "t1" ] },
                "$t1",
                "$t2"
            ]
        }
    }},
    // One document per (field name, array element) pair.
    { "$unwind": "$value" },
    // Count the occurrences of every (field name, value) combination.
    { "$group": {
        "_id": {
            "type": "$type",
            "value": "$value"
        },
        "count": { "$sum": 1 }
    }},
    // Stable, readable ordering: by field name, then by value.
    { "$sort": { "_id.type": 1, "_id.value": 1 } }
])

输出:

{ "_id" : { "type" : "t1", "value" : 1 }, "count" : 2 }
{ "_id" : { "type" : "t1", "value" : 2 }, "count" : 1 }
{ "_id" : { "type" : "t1", "value" : 3 }, "count" : 1 }
{ "_id" : { "type" : "t1", "value" : 6 }, "count" : 2 }
{ "_id" : { "type" : "t2", "value" : 1 }, "count" : 1 }
{ "_id" : { "type" : "t2", "value" : 8 }, "count" : 3 }
{ "_id" : { "type" : "t2", "value" : 9 }, "count" : 1 }

或者,如果您更希望得到单个文档,只需用下面的 $group 和 $project 阶段替换末尾的阶段:

    // Collapse all per-(type, value) counts into one document. For each of
    // t1/t2, push { value, count } when the row belongs to that type and the
    // placeholder `false` otherwise.
    { "$group": {
        "_id": null,
        "t1": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.type", "t1" ] },
                    { "value": "$_id.value", "count": "$count" },
                    false
                ]
            }
        },
        "t2": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.type", "t2" ] },
                    { "value": "$_id.value", "count": "$count" },
                    false
                ]
            }
        },
    }},
    // Strip the `false` placeholders from both arrays.
    // NOTE(review): $setDifference treats its inputs as sets, so the order of
    // the remaining entries is presumably not guaranteed - confirm in the docs.
    { "$project": {
        "_id": 0,
        "t1": { "$setDifference": [ "$t1", [false] ] },
        "t2": { "$setDifference": [ "$t2", [false] ] }
    }}

结果:

{ 
    "t1" : [ 
        { "value" : 2, "count" : 1 }, 
        { "value" : 6, "count" : 2 }, 
        { "value" : 3, "count" : 1 }, 
        { "value" : 1, "count" : 2 } 
    ], 
    "t2" : [ 
        { "value" : 1, "count" : 1 },
        { "value" : 9, "count" : 1 },
        { "value" : 8, "count" : 3 } 
    ] 
}

即使不使用 MongoDB 2.6 中的新运算符,也可以实现这些功能,只是需要多做一些工作。


mapReduce 的方式看起来也相当简单。由于 mapReduce 的限制,输出当然不是您想要的格式,但它无需在客户端迭代查询结果就能得到答案:

// Count the occurrences of every value in every array field of the documents.
// Result rows have the shape { _id: { k: <field name>, v: <value> }, value: <count> }.
db.collection.mapReduce(
    // Map: emit a count of 1 for each element of each array field.
    function () {
      for ( var k in this ) {
        var list = this[k];
        // Skip _id (rather than deleting it from the document) and ignore
        // any field that is not an array, which has no elements to count.
        if ( k === "_id" || !Array.isArray(list) ) {
          continue;
        }
        list.forEach(function(v) {
          emit( { k: k , v: v }, 1 );
        });
      }
    },
    // Reduce: add up the partial counts. MongoDB may call reduce more than
    // once for the same key, feeding earlier reduce results back in, so the
    // values must be summed; values.length would under-count in that case.
    function (key,values) {
      return Array.sum(values);
    },
    { "out": { "inline": 1 } }
)

输出结果为:

{ "_id" : { "k" : "t1", "v" : 1 }, "value" : 2 }
{ "_id" : { "k" : "t1", "v" : 2 }, "value" : 1 }
{ "_id" : { "k" : "t1", "v" : 3 }, "value" : 1 }
{ "_id" : { "k" : "t1", "v" : 6 }, "value" : 2 }
{ "_id" : { "k" : "t2", "v" : 1 }, "value" : 1 }
{ "_id" : { "k" : "t2", "v" : 8 }, "value" : 3 }
{ "_id" : { "k" : "t2", "v" : 9 }, "value" : 1 }

还取决于您是否需要灵活处理“关键”名称。

答案 1 :(得分:1)

// Per-value occurrence counts for the t1 array, ordered by value.
db.nr.aggregate([
    { $unwind: "$t1" },
    { $group: { _id: "$t1", count: { $sum: 1 } } },
    { $project: { _id: 0, t1: "$_id", count: 1 } },
    { $sort: { t1: 1 } }
])
{ "count" : 2, "t1" : 1 }
{ "count" : 1, "t1" : 2 }
{ "count" : 1, "t1" : 3 }
{ "count" : 2, "t1" : 6 }

// Per-value occurrence counts for the t2 array, ordered by value.
db.nr.aggregate([
    { $unwind: "$t2" },
    { $group: { _id: "$t2", count: { $sum: 1 } } },
    { $project: { _id: 0, t2: "$_id", count: 1 } },
    { $sort: { t2: 1 } }
])
{ "count" : 1, "t2" : 1 }
{ "count" : 3, "t2" : 8 }
{ "count" : 1, "t2" : 9 }