在单个结果中组合聚合操作

时间:2015-08-02 23:25:53

标签: mongodb mongoose mongodb-query aggregation-framework

我有两个想要合并的聚合操作。第一个操作返回的结果例如:

{ "_id" : "Colors", "count" : 12 }
{ "_id" : "Animals", "count" : 6 }

并且第二个操作返回,例如:

{ "_id" : "Red", "count" : 10 }
{ "_id" : "Blue", "count" : 9 }
{ "_id" : "Green", "count" : 9 }
{ "_id" : "White", "count" : 7 }
{ "_id" : "Yellow", "count" : 7 }
{ "_id" : "Orange", "count" : 7 }
{ "_id" : "Black", "count" : 5 }
{ "_id" : "Goose", "count" : 4 }
{ "_id" : "Chicken", "count" : 3 }
{ "_id" : "Grey", "count" : 3 }
{ "_id" : "Cat", "count" : 3 }
{ "_id" : "Rabbit", "count" : 3 }
{ "_id" : "Duck", "count" : 3 }
{ "_id" : "Turkey", "count" : 2 }
{ "_id" : "Elephant", "count" : 2 }
{ "_id" : "Shark", "count" : 2 }
{ "_id" : "Fish", "count" : 2 }
{ "_id" : "Tiger", "count" : 2 }
{ "_id" : "Purple", "count" : 1 }
{ "_id" : "Pink", "count" : 1 }

如何组合这两项操作以实现以下目标?

{ "_id" : "Colors", "count" : 12, "items" :
    [
        { "_id" : "Red", "count" : 10 },
        { "_id" : "Blue", "count" : 9 },
        { "_id" : "Green", "count" : 9 },
        { "_id" : "White", "count" : 7 },
        { "_id" : "Yellow", "count" : 7 },
        { "_id" : "Orange", "count" : 7 },
        { "_id" : "Black", "count" : 5 },
        { "_id" : "Grey", "count" : 3 },
        { "_id" : "Purple", "count" : 1 },
        { "_id" : "Pink", "count" : 1 }
    ]
},
{ "_id" : "Animals", "count" : 6, "items" :
    [
        { "_id" : "Goose", "count" : 4 },
        { "_id" : "Chicken", "count" : 3 },
        { "_id" : "Cat", "count" : 3 },
        { "_id" : "Rabbit", "count" : 3 },
        { "_id" : "Duck", "count" : 3 },
        { "_id" : "Turkey", "count" : 2 },
        { "_id" : "Elephant", "count" : 2 },
        { "_id" : "Shark", "count" : 2 },
        { "_id" : "Fish", "count" : 2 },
        { "_id" : "Tiger", "count" : 2 }
    ]
}

模式

// Mongoose schema for a titled list of string items created by a user.
var ListSchema = new Schema({
    // Creation timestamp; Date.now is passed as a function (not called here)
    // so the default is computed when it is applied.
    created: {
        type: Date,
        default: Date.now
    },
    // Display title: trimmed, and rejected with the given message when missing.
    title: {
        type: String,
        default: '',
        trim: true,
        required: 'Title cannot be blank'
    },
    // Array of trimmed strings. The element type is declared as String here;
    // the previous `type: Array` + `default: [String]` made the *default value*
    // an array holding the String constructor, and `trim` (a String option)
    // had no effect on an untyped array. Mongoose arrays default to [].
    items: [{
        type: String,
        trim: true
    }],
    // Reference to the User document that created the list.
    creator: {
        type: Schema.ObjectId,
        ref: 'User'
    }
});

操作1

// Number of lists per title, largest count first.
db.lists.aggregate([
  {
    // One output document per distinct title, counting its lists.
    $group: {
      _id: "$title",
      count: { $sum: 1 }
    }
  },
  {
    // Descending by count.
    $sort: { count: -1 }
  }
])

操作2

// Number of occurrences of each individual item across all lists,
// largest count first.
db.lists.aggregate([
  {
    // Emit one document per element of the `items` array.
    $unwind: "$items"
  },
  {
    // One output document per distinct item value, counting occurrences.
    $group: {
      _id: "$items",
      count: { $sum: 1 }
    }
  },
  {
    // Descending by count.
    $sort: { count: -1 }
  }
])

1 个答案:

答案 0 :(得分:3)

这实际上取决于您希望在响应中得到哪种类型的结果。您的问题似乎表明您想要的是结果中的“分面计数”(facet counts),不过这一点我稍后再谈。

就基本结果而言,下面这种做法并没有什么问题:

    // Two-stage grouping: count per (type, name) pair first, then roll those
    // counts up per type while collecting the per-name counts into an array.
    Thing.aggregate(
      [
        // Stage 1: one document per distinct (type, name) combination.
        { "$group": {
          "_id": {
            "type": "$type", "name": "$name"
          },
          "count": { "$sum": 1 }
        }},
        // Stage 2: sum the stage-1 counts per type and push each name with
        // its own count into `names`.
        { "$group": {
          "_id": "$_id.type",
          "count": { "$sum": "$count" },
          "names": {
            "$push": { "name": "$_id.name", "count": "$count" }
          }
        }}
      ],
      function(err,results) {
        // Only print when the aggregation succeeded; on failure `results`
        // carries nothing useful and the error is handed to the caller.
        if (!err) {
          console.log(JSON.stringify(results, undefined, 2));
        }
        callback(err);
      }
    )

哪个应该给你这样的结果:

[
  {
    "_id": "colours",
    "count": 50102,
    "names": [
      { "name": "Green",  "count": 9906  },
      { "name": "Yellow", "count": 10093 },
      { "name": "Red",    "count": 10083 },
      { "name": "Orange", "count": 9997  },
      { "name": "Blue",   "count": 10023 }
    ]
  },
  {
    "_id": "animals",
    "count": 49898,
    "names": [
      { "name": "Tiger",    "count": 9710  },
      { "name": "Lion",     "count": 10058 },
      { "name": "Elephant", "count": 10069 },
      { "name": "Monkey",   "count": 9963  },
      { "name": "Bear",     "count": 10098 }
    ]
  }
]

这里最基本的方法就是分两个阶段执行 $group:第一阶段按键的组合聚合到最低(最细粒度)的分组级别,然后再次执行 $group,在最高(最粗粒度)的分组级别上把各项总计“加起来”,同时把较低级别的结果添加到一个项目数组中。

但这并不像“分面计数”那样把各个结果“分开”,因此要做到那一点会稍微复杂一些,也更疯狂一些。不过先看例子:

$group

这将产生如下输出:

    // Builds a single "facets" document: `types` holds the per-type counts and
    // `names` holds the per-name counts as one flat list, independent of type.
    Thing.aggregate(
      [
        // Stage 1: one document per distinct (type, name) combination.
        { "$group": {
          "_id": {
            "type": "$type",
            "name": "$name"
          },
          "count": { "$sum": 1 }
        }},
        // Stage 2: roll counts up per type, keeping per-name counts in `names`.
        { "$group": {
          "_id": "$_id.type",
          "count": { "$sum": "$count" },
          "names": {
            "$push": { "name": "$_id.name", "count": "$count" }
          }
        }},
        // Stage 3: collapse everything into one document; `names` is now an
        // array of arrays (one inner array per type).
        { "$group": {
          "_id": null,
          "types": {
            "$push": {
              "type": "$_id", "count": "$count"
            }
          },
          "names": { "$push": "$names" }
        }},
        // `names` is nested two levels deep, so it is unwound twice: once for
        // the outer array and once for each inner per-type array.
        { "$unwind": "$names" },
        { "$unwind": "$names" },
        // Stage 6: regroup on the (identical) `types` array to rebuild
        // `names` as a single flat array.
        { "$group": {
          "_id": "$types",
          "names": { "$push": "$names" }
        }},
        // Stage 7: shape the response; `data` is an empty placeholder array.
        { "$project": {
          "_id": 0,
          "facets": {
            "types": "$_id",
            "names": "$names"
          },
          "data": { "$literal": [] }
        }}
      ],
      function(err,results) {
        // `results` must not be indexed when the aggregation failed:
        // reading results[0] would throw instead of reporting `err`.
        if (!err) {
          console.log(JSON.stringify(results[0], undefined, 2));
        }
        callback(err);
      }
    );

虽然这是“可能”的,但为了生成这种输出格式而在管道中进行的这类“杂耍”并不高效。与第一个示例相比,这里有大量开销,仅仅是为了把结果拆分到各自独立的数组中,并使其与分组键无关。需要生成的分面越多,这显然就会变得越复杂。

另外,正如输出中所暗示的,人们对“分面计数”通常的要求是:除了聚合得到的分面之外,响应中还要包含结果“数据”(很可能是分页的)。因此,进一步的复杂性在这里应该很明显:

{
  "facets": {
    "types": [
      { "type": "colours", "count": 50102 },
      { "type": "animals", "count": 49898 }
    ],
    "names": [
      { "name": "Green",    "count": 9906  },
      { "name": "Yellow",   "count": 10093 },
      { "name": "Red",      "count": 10083 },
      { "name": "Orange",   "count": 9997  },
      { "name": "Blue",     "count": 10023 },
      { "name": "Tiger",    "count": 9710  },
      { "name": "Lion",     "count": 10058 },
      { "name": "Elephant", "count": 10069 },
      { "name": "Monkey",   "count": 9963  },
      { "name": "Bear",     "count": 10098 }
    ]
  },
  "data": []
}

这种操作基本上要求把每一条数据都“塞进”单个对象中。在大多数情况下,尤其是当您希望结果中包含实际数据时(本示例使用了 100,000 条),这种方法完全不切实际,而且几乎肯定会超过 16MB 的 BSON 文档大小限制。

在这种情况下,如果您想在一个响应中同时返回结果以及这些数据的“分面”,最好的方法是把每个聚合和输出页面作为单独的查询操作来运行,并把输出的 JSON(或其他格式)以“流”的方式返回给接收端客户端。

作为一个自包含的例子:

        { "$group": {
          "_id": null,
          (...)

输出如:

// Third-party modules used by this script.
var async = require('async');
var mongoose = require('mongoose');
var Schema = mongoose.Schema;

mongoose.connect('mongodb://localhost/things');

// Sample vocabulary: each key is a "type" and its array lists the "name"
// values that can be generated for that type.
var data = {
  "colours": ["Red","Blue","Green","Yellow","Orange"],
  "animals": ["Lion","Tiger","Bear","Elephant","Monkey"]
};
var dataKeys = Object.keys(data);

// A Thing is just a (name, type) pair.
var thingSchema = new Schema({ "name": String, "type": String });
var Thing = mongoose.model( 'Thing', thingSchema );

// Destination for the generated JSON response.
var writer = process.stdout;

/**
 * Converts a `$group` result ({ _id, count }) into the facet entry shape
 * written to the response ({ name, count }).
 */
function toFacetEntry(doc) {
  return { "name": doc._id, "count": doc.count };
}

/**
 * Opens an aggregation cursor stream that counts documents per distinct
 * value of `field` (a field path such as "$name").
 */
function countBy(field) {
  return Thing.collection.aggregate(
    [
      { "$group": {
        "_id": field,
        "count": { "$sum": 1 }
      }}
    ],
    {
      "cursor": {
        "batchSize": 1000
      }
    }
  );
}

/**
 * Writes every document emitted by `stream` to `writer` as one JSON array.
 *
 * @param {Object}   stream     emitter of "data" / "end" / "error" events
 *                              that also offers pause() and resume()
 * @param {String}   opening    text that opens the array (key and "[")
 * @param {String}   itemIndent indentation written before each element
 * @param {String}   closing    text that closes the array
 * @param {Function} transform  optional mapper applied to each document
 * @param {Function} done       called once, with the error if one occurred
 */
function writeJsonArray(stream, opening, itemIndent, closing, transform, done) {
  var counter = 0,
      finished = false;

  // Guard so `done` fires exactly once even if "error" is followed by "end".
  function finish(err) {
    if (finished) return;
    finished = true;
    done(err);
  }

  stream.on("data",function(doc) {
    stream.pause();

    // The first element opens the array; later ones are comma separated.
    writer.write( counter === 0 ? opening : ",\n" );
    writer.write(itemIndent + JSON.stringify(transform ? transform(doc) : doc));

    counter++;
    stream.resume();
  });

  // Without this a failing cursor would leave the series waiting forever.
  stream.on("error",finish);

  stream.on("end",function() {
    // An empty result never reached the "data" handler, so the array still
    // has to be opened here or the closing text would yield invalid JSON.
    if ( counter === 0 ) writer.write(opening);
    writer.write(closing);
    finish();
  });
}

// Once connected: reset the collection, insert random sample documents, then
// stream a JSON response with "names" / "types" facets and one page of data.
mongoose.connection.on("open",function(err) {
  if (err) throw err;
  async.series(
    [
      // Step 1: start from an empty collection.
      function(callback) {
        process.stderr.write("removing\n");
        Thing.remove({},callback);
      },
      // Step 2: insert the sample documents in unordered bulk batches.
      function(callback) {
        process.stderr.write("inserting\n");
        var total     = 100000,
            batchSize = 1000,
            bulk      = Thing.collection.initializeUnorderedBulkOp(),
            count     = 0;

        async.whilst(
          function() { return count < total; },
          function(callback) {
            // Math.random() takes no arguments; scale it to pick an index.
            var keyLen    = dataKeys.length,
                keyIndex  = Math.floor(Math.random()*keyLen),
                type      = dataKeys[keyIndex],
                types     = data[type],
                typeLen   = types.length,
                nameIndex = Math.floor(Math.random()*typeLen),
                name      = types[nameIndex];

            var obj = { "type": type, "name": name };
            bulk.insert(obj);
            count++;

            if ( count % batchSize === 0 ) {
              process.stderr.write('insert count: ' + count + "\n");
              bulk.execute(function(err,resp) {
                bulk = Thing.collection.initializeUnorderedBulkOp();
                callback(err);
              });
            } else {
              callback();
            }

          },
          function(err) {
            // Flush a trailing partial batch; with the values above the total
            // is a multiple of the batch size, so nothing is left to send.
            if ( err || count % batchSize === 0 ) return callback(err);
            bulk.execute(function(err) {
              callback(err);
            });
          }
        );
      },

      // Step 3: write the response, one query at a time, in document order.
      function(callback) {
        writer.write("{ \n  \"page\": 1,\n  \"pageSize\": 25,\n");
        writer.write("  \"facets\":  {\n");      // open object response

        async.series(
          [
            // Each stream is created only when its step starts, so the
            // queries run strictly one after another.
            function(next) {
              writeJsonArray(
                countBy("$name"),
                "    \"names\": [\n", "      ", "\n    ],\n",
                toFacetEntry, next
              );
            },
            function(next) {
              writeJsonArray(
                countBy("$type"),
                "    \"types\": [\n", "      ", "\n    ]\n  },\n",
                toFacetEntry, next
              );
            },
            function(next) {
              writeJsonArray(
                Thing.find({}).limit(25).stream(),
                "  \"data\": [\n", "    ", "\n  ]\n}\n",
                null, next
              );
            }
          ],
          function(err) {
            callback(err);
          }
        );
      }
    ],
    function(err) {
      if (err) throw err;
      process.exit();
    }
  );
});

(脚本的示例输出:)

{ "page": 1, "pageSize": 25, "facets": { "names": [ {"name":"Red","count":10007}, {"name":"Tiger","count":10012}, {"name":"Yellow","count":10119}, {"name":"Monkey","count":9970}, {"name":"Elephant","count":10046}, {"name":"Bear","count":10082}, {"name":"Orange","count":9982}, {"name":"Green","count":10005}, {"name":"Blue","count":9884}, {"name":"Lion","count":9893} ], "types": [ {"name":"colours","count":49997}, {"name":"animals","count":50003} ] }, "data": [ {"_id":"55bf141f3edc150b6abdcc02","type":"animals","name":"Lion"}, {"_id":"55bf141f3edc150b6abdc81b","type":"colours","name":"Blue"}, {"_id":"55bf141f3edc150b6abdc81c","type":"colours","name":"Orange"}, {"_id":"55bf141f3edc150b6abdc81d","type":"animals","name":"Bear"}, {"_id":"55bf141f3edc150b6abdc81e","type":"animals","name":"Elephant"}, {"_id":"55bf141f3edc150b6abdc81f","type":"colours","name":"Orange"}, {"_id":"55bf141f3edc150b6abdc820","type":"colours","name":"Green"}, {"_id":"55bf141f3edc150b6abdc821","type":"animals","name":"Lion"}, {"_id":"55bf141f3edc150b6abdc822","type":"animals","name":"Monkey"}, {"_id":"55bf141f3edc150b6abdc823","type":"colours","name":"Yellow"}, {"_id":"55bf141f3edc150b6abdc824","type":"colours","name":"Yellow"}, {"_id":"55bf141f3edc150b6abdc825","type":"colours","name":"Orange"}, {"_id":"55bf141f3edc150b6abdc826","type":"animals","name":"Monkey"}, {"_id":"55bf141f3edc150b6abdc827","type":"colours","name":"Blue"}, {"_id":"55bf141f3edc150b6abdc828","type":"animals","name":"Tiger"}, {"_id":"55bf141f3edc150b6abdc829","type":"colours","name":"Red"}, {"_id":"55bf141f3edc150b6abdc82a","type":"animals","name":"Monkey"}, {"_id":"55bf141f3edc150b6abdc82b","type":"animals","name":"Elephant"}, {"_id":"55bf141f3edc150b6abdc82c","type":"animals","name":"Tiger"}, {"_id":"55bf141f3edc150b6abdc82d","type":"animals","name":"Bear"}, {"_id":"55bf141f3edc150b6abdc82e","type":"colours","name":"Yellow"}, {"_id":"55bf141f3edc150b6abdc82f","type":"animals","name":"Lion"}, {"_id":"55bf141f3edc150b6abdc830","type":"animals","name":"Elephant"}, {"_id":"55bf141f3edc150b6abdc831","type":"colours","name":"Orange"}, {"_id":"55bf141f3edc150b6abdc832","type":"animals","name":"Elephant"} ] }

这里有一些注意事项,特别是 mongoose 的 .aggregate() 并不真正直接支持标准的 Node 流接口。在聚合方法的 .cursor() 上可以使用 .each() 方法,但核心 API 方法所隐含的“流”在这里提供了更多的控制权,因此这里通过 .collection 获取底层驱动对象的做法更为可取。希望未来的 mongoose 版本会考虑这一点。

所以,如果您的最终目标是像这里演示的那样,在返回结果的同时附带“分面计数”,那么最合理的做法就是按所示方式把每个聚合和结果以“流”的形式输出。否则,聚合会变得过于复杂,并且很可能超过 BSON 限制,正如本例中采用其他做法时会发生的那样。