Mongo 聚合分组:内层分组/计数数组

时间:2015-09-15 09:46:33

标签: mongodb mongodb-query aggregation-framework

我被目前的 Aggregate 表达式卡住了,感到很困惑,希望能得到一些建议,或者一个直接在 Mongo 中实现的解决方案。

来自Mongo的原始数据(简化为我现在需要的字段):

[{
  'Status': 'Cancelled',
  'CIC package': 'Test Gallery Cafe',
},
{
  'Status': 'Completed',
  'CIC package': 'Design Thinking workshop'
},
{
  'Status': 'Tentative',
  'CIC package': 'Design Thinking workshop'
},
{
  'Status': 'Confirmed',
  'CIC package': 'Product / solution demonstration'
},

....etc
]

总的来说……大约有1000条记录,大概有8种“CIC package”,各自带有不同的状态(Confirmed、Cancelled、Tentative、Completed),此外还有一些我目前已排除的其他数据。

我正在寻找的最终结果是这样的:

    [{
    "_id" : "Test Gallery Café",
    "package" : "Test Gallery Café",
    "status" : [
      {
        "Cancelled": 1
      },
      {
        "Completed": 1
      }
    ]
  },
  {
    "_id" : "Design Thinking workshop",
    "package" : "Design Thinking workshop",
    "status" : [
      {
        "Cancelled": 3
      },
      {
        "Completed": 2
      }
    ]
  },
  {
    "_id" : "Product / solution demonstration",
    "package" : "Product / solution demonstration",
    "status" : [
      {

        "Completed": 10
      },
      {
        "Cancelled": 3
      },
      {
        "Confirmed": 1
      }
    ]
  }]

也就是说,对于每个 CIC package(我在 $group 中把它重命名为 package),我想统计数据集中出现的每种状态的数量。状态和 package 都不受我控制,以后可能会有新的值加入,因此分组必须是动态的。

我到目前为止:

  // Collect, for every distinct "CIC package", the raw list of statuses found
  // on its records. Counting the statuses is NOT done here.
  db.reportData.aggregate([
    // Stage 1: pass through only the two fields the grouping reads.
    { $project: { "CIC package": 1, "Status": 1 } },

    // Stage 2: one output document per distinct package value.
    { $group: {
        _id: "$CIC package",
        // Same value as _id, exposed under the name "package".
        package: { $first: "$CIC package" },
        // Append each grouped record's Status to an array.
        status: { $push: "$Status" }
    } }
  ]).toArray()

得到的结果类似这样:

  [{
    "_id" : "Test Gallery Café",
    "package" : "Test Gallery Café",
    "status" : [
      "Cancelled",
      "Completed"
    ]
  },
  {
    "_id" : "Design Thinking workshop",
    "package" : "Design Thinking workshop",
    "status" : [
      "Cancelled",
      "Cancelled",
      "Cancelled",
      "Completed",
      "Completed"
    ]
  },
  {
    "_id" : "Product / solution demonstration",
    "package" : "Product / solution demonstration",
    "status" : [
      "Completed",
      "Completed",
      "Cancelled",
      "Processing",
      "Cancelled",
      "Cancelled",
      "Completed",
      "Completed",
      "Completed",
      "Completed",
      "Completed",
      "Completed",
      "Completed",
      "Completed",
      "Completed",
      "Tentative"
    ]
  }]

这是一个更大的集合的小提取,但它是迄今为止的结果的一个很好的样本。

我试过在最后先 $unwind 再接一个 $group,它确实生成了新的记录,我也许可以对这些记录再做一次分组,但我现在有点糊涂了,而且我可能把事情做得太低效了。我觉得自己已经差不多做到了,但很希望能得到一些意见。

有什么想法吗?

2 个答案:

答案 0 :(得分:2)

你想要的基本上是按“状态”类型统计的“各类型计数”:先按状态分组进行计数,然后再仅按对象的主 _id 分组:

// Count documents per (package, status) pair, then fold the counts back into
// one document per package whose "status" array holds one
// { "<status name>": <count> } entry per status.
db.reportData.aggregate([
    // Input documents carry an array of status strings; emit one document per entry.
    { "$unwind": "$status" },
    // First pass: count how often each status occurs within each package.
    { "$group": {
        "_id": {
            "_id": "$_id",
            "package": "$package",
            "status": "$status"
        },
        "count": { "$sum": 1 }
    }},
    // Second pass: regroup by the original _id and label each count with its status.
    { "$group": {
        "_id": "$_id._id",
        "package": { "$first": "$_id.package" },
        "status": {
            "$push": {
                // Every known status is tested explicitly. Previously the last
                // branch labelled anything that was not Completed/Cancelled/
                // Processing as "Tentative", so e.g. "Confirmed" counts were
                // reported under the wrong name.
                "$cond": [
                    { "$eq": [ "$_id.status", "Completed" ] },
                    { "Completed": "$count" },
                    { "$cond": [
                        { "$eq": [ "$_id.status", "Cancelled" ] },
                        { "Cancelled": "$count" },
                        { "$cond": [
                            { "$eq": [ "$_id.status", "Processing" ] },
                            { "Processing": "$count" },
                            { "$cond": [
                                { "$eq": [ "$_id.status", "Tentative" ] },
                                { "Tentative": "$count" },
                                { "$cond": [
                                    { "$eq": [ "$_id.status", "Confirmed" ] },
                                    { "Confirmed": "$count" },
                                    // Any status not listed above is kept
                                    // visible under a neutral key.
                                    { "Other": "$count" }
                                ]}
                            ]}
                        ]}
                    ]}
                ]
            }
        }
    }}
])

或者保持通用的写法,在结果中为每个状态使用一个“type”字段:

// Two-pass count in which every array entry carries the status as data
// ("type") rather than as a key, so no status name is hard-coded.
db.reportData.aggregate([
    // One document per entry of the "status" array.
    { $unwind: "$status" },

    // Pass 1: one document per distinct (_id, package, status) with its count.
    { $group: {
        _id: { _id: "$_id", package: "$package", status: "$status" },
        count: { $sum: 1 }
    } },

    // Pass 2: regroup by the original _id, collecting { type, count } pairs.
    { $group: {
        _id: "$_id._id",
        package: { $first: "$_id.package" },
        status: { $push: { type: "$_id.status", count: "$count" } }
    } }
])

这会给你这样的结果:

{
        "_id" : "Test Gallery Café",
        "package" : "Test Gallery Café",
        "status" : [
                {
                        "type" : "Completed",
                        "count" : 1
                },
                {
                        "type" : "Cancelled",
                        "count" : 1
                }
        ]
}
{
        "_id" : "Design Thinking workshop",
        "package" : "Design Thinking workshop",
        "status" : [
                {
                        "type" : "Completed",
                        "count" : 2
                },
                {
                        "type" : "Cancelled",
                        "count" : 3
                }
        ]
}
{
        "_id" : "Not specified",
        "package" : "Not specified",
        "status" : [
                {
                        "type" : "Processing",
                        "count" : 1
                },
                {
                        "type" : "Tentative",
                        "count" : 1
                },
                {
                        "type" : "Cancelled",
                        "count" : 3
                },
                {
                        "type" : "Completed",
                        "count" : 11
                }
        ]
}

在未来的 MongoDB 版本中,借助 $filter 可以对此做进一步改进:

// For each document, count how many entries of its "status" array equal a
// given value: filter the array down to the matches and take the size of
// what is left. No $unwind stage is involved.
db.reportData.aggregate([
    { $project: {
        package: 1,
        // Number of "Completed" entries in the status array.
        statusComplete: {
            $size: {
                $filter: {
                    input: "$status",
                    as: "el",
                    cond: { $eq: [ "$$el", "Completed" ] }
                }
            }
        },
        // Number of "Cancelled" entries in the status array.
        statusCancelled: {
            $size: {
                $filter: {
                    input: "$status",
                    as: "el",
                    cond: { $eq: [ "$$el", "Cancelled" ] }
                }
            }
        }
    } }
])

因为这个问题本质上就是“统计数组中匹配的元素个数”,而最后这种写法还可以进一步扩展,产生与前面相同的数组结果,并且由于不需要 $unwind,开销会小得多。当然,这个功能目前尚未发布,这里只是举例说明将来可以怎么做。

另外说明一下,原帖在被修改之前给出的数据是:

{
        "_id" : "Test Gallery Café",
        "package" : "Test Gallery Café",
        "status" : [
                "Cancelled",
                "Completed"
        ]
}
{
        "_id" : "Design Thinking workshop",
        "package" : "Design Thinking workshop",
        "status" : [
                "Cancelled",
                "Cancelled",
                "Cancelled",
                "Completed",
                "Completed"
        ]
}
{
        "_id" : "Not specified",
        "package" : "Not specified",
        "status" : [
                "Completed",
                "Completed",
                "Cancelled",
                "Processing",
                "Cancelled",
                "Cancelled",
                "Completed",
                "Completed",
                "Completed",
                "Completed",
                "Completed",
                "Completed",
                "Completed",
                "Completed",
                "Completed",
                "Tentative"
        ]
}

答案 1 :(得分:1)

好的,

在 Blakes Seven 的帮助下,我找到了解决方案。下面的查询看起来可行,它基于我在最初的问题中贴出的起始数据集,并在末尾额外增加了 2 个 $group 阶段,以生成我想要的结果。

      // Full pipeline from the raw records to one document per package whose
      // "status" array holds a { status, count } entry per distinct status.
      db.reportData.aggregate([
        // Keep only the fields later stages read ('Start Date' was projected
        // before but never used).
        { $project: { 'CIC package': 1, 'Status': 1 } },

        // One document per package, collecting its statuses in an array.
        { $group: {
            _id: '$CIC package',
            package: { $first: '$CIC package' },
            status: { $push: '$Status' }
        } },

        // Back to one document per individual status entry.
        { $unwind: '$status' },

        // Count each (package, status) pair. The former package/status $first
        // accumulators were dropped: the next stage reads those values from
        // the compound _id, never from the accumulators.
        { $group: {
            _id: { "_id": "$_id", "package": "$package", "status": "$status" },
            count: { $sum: 1 }
        } },

        // Regroup per package and collect the per-status counts.
        { $group: {
            _id: "$_id._id",
            package: { $first: "$_id.package" },
            status: { $push: { "status": "$_id.status", "count": '$count' } }
        } }
      ]).toArray()

它会生成如下数据集:

[
      {
        "_id" : "Studio Canal",
        "package" : "Studio Canal",
        "status" : [
          {
            "status" : "Completed",
            "count" : 8
          },
          {
            "status" : "Cancelled",
            "count" : 2
          }
        ]
      },
      {
        "_id" : "Meeting / forum",
        "package" : "Meeting / forum",
        "status" : [
          {
            "status" : "Cancelled",
            "count" : 254
          },
          {
            "status" : "Completed",
            "count" : 275
          },
          {
            "status" : "Processing",
            "count" : 6
          },
          {
            "status" : "Tentative",
            "count" : 1
          },
          {
            "status" : "Confirmed",
            "count" : 6
          }
        ]
      },
      {
        "_id" : "Design Thinking workshop",
        "package" : "Design Thinking workshop",
        "status" : [
          {
            "status" : "Cancelled",
            "count" : 2
          }
        ]
      },
      {
        "_id" : "Test Gallery Café",
        "package" : "Test Gallery Café",
        "status" : [
          {
            "status" : "Cancelled",
            "count" : 1
          },
          {
            "status" : "Completed",
            "count" : 1
          }
        ]
      },
      {
        "_id" : "Not specified",
        "package" : "Not specified",
        "status" : [
          {
            "status" : "Completed",
            "count" : 124
          },
          {
            "status" : "Tentative",
            "count" : 1
          },
          {
            "status" : "Cancelled",
            "count" : 42
          },
          {
            "status" : "Confirmed",
            "count" : 4
          },
          {
            "status" : "Processing",
            "count" : 5
          }
        ]
      },
      {
        "_id" : "Customer / partner / special event",
        "package" : "Customer / partner / special event",
        "status" : [
          {
            "status" : "Tentative",
            "count" : 1
          },
          {
            "status" : "Cancelled",
            "count" : 145
          },
          {
            "status" : "Processing",
            "count" : 3
          },
          {
            "status" : "Completed",
            "count" : 284
          },
          {
            "status" : "Confirmed",
            "count" : 8
          }
        ]
      },
      {
        "_id" : "Product / solution demonstration",
        "package" : "Product / solution demonstration",
        "status" : [
          {
            "status" : "Tentative",
            "count" : 1
          },
          {
            "status" : "Confirmed",
            "count" : 4
          },
          {
            "status" : "Cancelled",
            "count" : 82
          },
          {
            "status" : "Completed",
            "count" : 130
          },
          {
            "status" : "Processing",
            "count" : 1
          }
        ]
      }
    ]

这正是我想要的结果。我还需要检查数据是否正确,不过看起来没有问题。现在唯一的问题是:它能不能优化,以及我是否应该优化它。这也许留到明天再说。