Question

这个问题源于我的一个习惯：仔细阅读 SO 上别人提出的问题，结果往往又给自己引出一个新的问题。因此，除了把努力寻找解决方案当作一次学习练习之外，我发现过程中还会冒出另一个问题，比如本题。

原问题（original question）的 OP 尚未接受任何答案，实际上也没有澄清“他们”想要实现的目标。不过我确实给出了自己的理解，并分别以简单形式和冗长形式提供了可能的解法。

最后，这个过程让我想知道：鉴于这个解决方案如此冗长，下一个（目前预期为 2.6）MongoDB 版本是否会引入一些新功能，可以借助新增的其他聚合运算符来实现同样的结果。

所以情况如下：

样本文件

{
    "tracked_item_type" : "Software",
    "tracked_item_name" : "Word",
    "duration" : 9540
}
{
    "tracked_item_type" : "Software",
    "tracked_item_name" : "Excel",
    "duration" : 4000
}
{
    "tracked_item_type" : "Software",
    "tracked_item_name" : "Notepad",
    "duration" : 4000
}
{
    "tracked_item_type" : "Site",
    "tracked_item_name" : "Facebook",
    "duration" : 7920
}
{
    "tracked_item_type" : "Site",
    "tracked_item_name" : "Twitter",
    "duration" : 5555
}
{
    "tracked_item_type" : "Site",
    "tracked_item_name" : "Digital Blasphemy",
    "duration" : 8000
}

期望结果

每种类型的前两名结果，按总持续时间排序。虽然这只是一个小样本，但这里的持续时间应视为对许多条目求 $sum 的结果。

{ 
    "tracked_item_type": "Site",
    "tracked_item_name": "Digital Blasphemy",
    "duration" : 8000
}
{ 
    "tracked_item_type": "Site",
    "tracked_item_name": "Facebook",
    "duration" : 7920
}
{ 
    "tracked_item_type": "Software",
    "tracked_item_name": "Word",
    "duration" : 9540
}
{ 
    "tracked_item_type": "Software",
    "tracked_item_name": "Notepad",
    "duration" : 4000
}

聚合解决方案

这是解决问题的冗长方法

// Mongo shell pipeline: total the duration per (type, name) pair, then reshape
// the result so that each of the top two "Site" items carries the top two
// "Software" items. This is the intermediate ("not there yet") form: one
// document per site, each with an embedded "software" array.
db.collection.aggregate([

    // Group on the types and "sum" of duration
    {"$group": {
        "_id": {
            "tracked_item_type": "$tracked_item_type",
            "tracked_item_name": "$tracked_item_name"
        },
        "duration": {"$sum": "$duration"}
    }},

    // Sort by type and duration descending
    {"$sort": { "_id.tracked_item_type": 1, "duration": -1 }},

    /* The fun part */

    // Re-shape results to "sites" and "software" arrays.
    // Every input document is pushed to both arrays; the $cond substitutes
    // null in the array the document does not belong to.
    {"$group": {
        "_id": null,
        "sites": {"$push":
            {"$cond": [
                {"$eq": ["$_id.tracked_item_type", "Site" ]},
                { "_id": "$_id", "duration": "$duration" },
                null
            ]}
        },
        "software": {"$push":
            {"$cond": [
                {"$eq": ["$_id.tracked_item_type", "Software" ]},
                { "_id": "$_id", "duration": "$duration" },
                null
            ]}
        }
    }},


    // Remove the null values for "software"
    // (unwind, drop the nulls, then rebuild the array; "sites" is carried along)
    {"$unwind": "$software"},
    {"$match": { "software": {"$ne": null} }},
    {"$group": {
        "_id": "$_id",
        "software": {"$push": "$software"},
        "sites": {"$first": "$sites"}
    }},

    // Remove the null values for "sites"
    // (same technique; "software" is carried along)
    {"$unwind": "$sites"},
    {"$match": { "sites": {"$ne": null} }},
    {"$group": {
        "_id": "$_id",
        "software": {"$first": "$software"},
        "sites": {"$push": "$sites"}
    }},


    // Project out software and limit to the *top* 2 results
    {"$unwind": "$software"},
    {"$project": {
        // "_id" becomes the software record itself. The original literal also
        // listed `"_id": 0` before this entry; a repeated key is overridden by
        // the later one, so that entry had no effect and is dropped.
        "_id": { "_id": "$software._id", "duration": "$software.duration" },
        "sites": "$sites"
    }},
    // NOTE(review): relies on the unwound "software" entries still being in the
    // duration-descending order established by the $sort above - confirm.
    {"$limit" : 2},


    // Project sites, grouping multiple software per key, requires a sort
    // then limit the *top* 2 results
    {"$unwind": "$sites"},
    {"$group": {
        "_id": { "_id": "$sites._id", "duration": "$sites.duration" },
        "software": {"$push": "$_id" }
    }},
    {"$sort": { "_id.duration": -1 }},
    {"$limit": 2}

])

“还没有”输出

这是聚合尚未得到最终结果时所停留的位置，至少以我目前的理解是如此。

{
    "result" : [
        {
            "_id" : {
                "_id" : {
                    "tracked_item_type" : "Site",
                    "tracked_item_name" : "Digital Blasphemy"
                 },
                 "duration" : 8000
           },
            "software" : [
                {
                    "_id" : {
                        "tracked_item_type" : "Software",
                        "tracked_item_name" : "Word"
                    },
                    "duration" : 9540
                },

                {
                    "_id" : {
                        "tracked_item_type" : "Software",
                        "tracked_item_name" : "Notepad"
                    },
                    "duration" : 4000
                }
            ]
        },
        {
            "_id" : {
                "_id" : {
                    "tracked_item_type" : "Site",
                    "tracked_item_name" : "Facebook"
                },
                "duration" : 7920
            },
            "software" : [
                {
                    "_id" : {
                        "tracked_item_type" : "Software",
                        "tracked_item_name" : "Word"
                    },
                    "duration" : 9540
                },
                {
                    "_id" : {
                        "tracked_item_type" : "Software",
                        "tracked_item_name" : "Notepad"
                    },
                    "duration" : 4000
                }
            ]
        }
    ],
    "ok" : 1
}

（对我来说）这一切似乎都很合理：结果虽然并不完整，但可以在代码中进行后处理，把它整理成所需的形式。

但这确实像是一个练习：我很好奇，能否借助聚合框架任何即将推出的功能（或者某种我尚未想到的其他技术），直接得到期望的结果形式。

所以请随意回答任何关于如何实现这一目标的建议/指示。

Answer 1

下面这个聚合会在每个类别中按持续时间找出前两名（出现并列时它会任意打破平局，这似乎与您的示例输出一致）：

// Pipeline stages for "top two items per type by total duration".
// They are meant to be passed to aggregate() in the order declared here.
// Every stage is declared with `var`: assigning to an undeclared name creates
// an implicit global and throws a ReferenceError in strict/module code.

// Total duration per (type, name) pair.
var pregroup = { "$group" : {
        "_id" : {
            "type" : "$tracked_item_type",
            "name" : "$tracked_item_name"
        },
        "duration" : {
            "$sum" : "$duration"
        }
    }
};

// Order by type, largest total first within each type.
var sort = { "$sort" : { "_id.type" : 1, "duration" : -1 } };

// One document per type. "num1" is the first (largest) entry in sort order;
// "other" and "all" each collect every {name, dur} pair of the type.
// "other" is unwound later to find the runner-up, while "all" is kept intact
// so the final rows can be emitted from it.
var group1 = { "$group" : {
        "_id" : "$_id.type",
        "num1" : {
            "$first" : {
                "name" : "$_id.name",
                "dur" : "$duration"
            }
        },
        "other" : {
            "$push" : {
                "name" : "$_id.name",
                "dur" : "$duration"
            }
        },
        "all" : {
            "$push" : {
                "name" : "$_id.name",
                "dur" : "$duration"
            }
        }
    }
};

// One document per candidate in "other".
var unwind = { "$unwind" : "$other" };

// Flag the candidates that are not the number-one entry. Names are unique
// within a type because pregroup groups on (type, name).
var project = { "$project" : {
        "keep" : {
            "$ne" : [
                "$num1.name",
                "$other.name"
            ]
        },
        "num1" : 1,
        "all" : 1,
        "other" : 1
    }
};

// Drop the number-one entry from the candidates.
var match = { "$match" : { "keep" : true } };

// Re-sort the remaining candidates, largest first within each type.
var sort2 = { "$sort" : { "_id" : 1, "other.dur" : -1 } };

// Back to one document per type: the first remaining candidate is the runner-up.
var group2 = { "$group" : {
        "_id" : "$_id",
        "numberOne" : {
            "$first" : "$num1"
        },
        "numberTwo" : {
            "$first" : "$other"
        },
        "all" : {
            "$first" : "$all"
        }
    }
};

// One document per original {name, dur} pair of the type.
var unwind2 = { "$unwind" : "$all" };

// Shape the output rows. Name and duration are null unless the row is the
// number-one or number-two entry of its type.
var project2 = { "$project" : {
        "_id" : 0,
        "tracked_item_type" : "$_id",
        "tracked_item_name" : {
            "$cond" : [
                {
                    "$or" : [
                        {
                            "$eq" : [
                                "$all.name",
                                "$numberOne.name"
                            ]
                        },
                        {
                            "$eq" : [
                                "$all.name",
                                "$numberTwo.name"
                            ]
                        }
                    ]
                },
                "$all.name",
                null
            ]
        },
        "duration" : {
            "$cond" : [
                {
                    "$or" : [
                        {
                            "$eq" : [
                                "$all.name",
                                "$numberOne.name"
                            ]
                        },
                        {
                            "$eq" : [
                                "$all.name",
                                "$numberTwo.name"
                            ]
                        }
                    ]
                },
                "$all.dur",
                null
            ]
        }
    }
};

// Keep only the rows that survived the top-two test above.
var match2 = { "$match" : { "tracked_item_name" : { "$ne" : null } } };

使用您的示例数据运行它：

// Pass the eleven stages, in order, to aggregate() on the "top2" collection and
// call toArray() on what it returns; the printed result follows.
db.top2.aggregate(pregroup, sort, group1, unwind, project, match, sort2, group2, unwind2, project2, match2).toArray()
[
    {
        "tracked_item_type" : "Software",
        "tracked_item_name" : "Word",
        "duration" : 9540
    },
    {
        "tracked_item_type" : "Software",
        "tracked_item_name" : "Notepad",
        "duration" : 4000
    },
    {
        "tracked_item_type" : "Site",
        "tracked_item_name" : "Digital Blasphemy",
        "duration" : 8000
    },
    {
        "tracked_item_type" : "Site",
        "tracked_item_name" : "Facebook",
        "duration" : 7920
    }
]

这适用于任意数量的类别（即不同的 tracked_item_type 值），您无需事先知道所有类别的名称。然而，要把它推广到前三、前四、前五等，每增加一个“前 N”名次就要多加四个阶段——既不实用，也不优雅。

请为这个 JIRA 工单投票（vote up this jira ticket），以便在聚合框架中获得更原生的“前 N”功能。

Answer 2

首先得到我自己的答案！

我没想到，但有一个答案可以在 2.6 的实施中找到，其中包括一些很棒的新Set Operators。

我（最终）想到的是：问题归结为把两个独立的列表变成一个，也就是如何合并这些条目，使它们都归入同一个字段。于是就用到了 $setUnion 运算符。

首先是新片段，我将在部分解释：

// So this part just "normalizes" a little so we get one record that essentially has // two arrays in it {"$group": { _id: { _id: null, software: "$software" }, sites: {$push:"$_id" } }},

结果文件：

{ "_id" : { "_id" : null, "software" : [ { "_id" : { "tracked_item_type" : "Software", "tracked_item_name" : "Word" }, "duration" : 9540 }, { "_id" : { "tracked_item_type" : "Software", "tracked_item_name" : "Notepad" }, "duration" : 4000 } ] }, "sites" : [ { "_id" : { "tracked_item_type" : "Site", "tracked_item_name" : "Digital Blasphemy" }, "duration" : 8000 }, { "_id" : { "tracked_item_type" : "Site", "tracked_item_name" : "Facebook" }, "duration" : 7920 } ] }

因此该文档可以说是结果的更好形式，而不是我之前离开的地方，考虑到项目不再重复，基本上有两个列表，我们希望合并为一个。因此，现在要做的就是使用促进此合并的运算符：

// Then we just project with a new field, and the "$setUnion" of the two arrays {"$project": { "_id": 0, "records": {"$setUnion": ["$_id.software", "$sites"]} }},

这就把我们带到了这里：

{ "records" : [ { "_id" : { "tracked_item_type" : "Site", "tracked_item_name" : "Facebook" }, "duration" : 7920 }, { "_id" : { "tracked_item_type" : "Software", "tracked_item_name" : "Word" }, "duration" : 9540 }, { "_id" : { "tracked_item_type" : "Site", "tracked_item_name" : "Digital Blasphemy" }, "duration" : 8000 }, { "_id" : { "tracked_item_type" : "Software", "tracked_item_name" : "Notepad" }, "duration" : 4000 } ] }

基本上就是这样。现在我们只有四个项目，所以只需要一点“展开”，投影和排序，我们就会得到我正在寻找的确切结果。

所以这就是整个事情，只是为了记录：

// Complete mongo shell pipeline: top two items per type by total duration,
// emitted as flat documents. The two per-type lists are merged with $setUnion.
db.collection.aggregate([

    // Group on the types and "sum" of duration
    {"$group": {
        "_id": {
            "tracked_item_type": "$tracked_item_type",
            "tracked_item_name": "$tracked_item_name"
        },
        "duration": {"$sum": "$duration"}
    }},

    // Sort by type and duration descending
    {"$sort": { "_id.tracked_item_type": 1, "duration": -1 }},

    /* The fun part */

    // Re-shape results to "sites" and "software" arrays.
    // Every input document is pushed to both arrays; the $cond substitutes
    // null in the array the document does not belong to.
    {"$group": {
        "_id": null,
        "sites": {"$push":
            {"$cond": [
                {"$eq": ["$_id.tracked_item_type", "Site" ]},
                { "_id": "$_id", "duration": "$duration" },
                null
            ]}
        },
        "software": {"$push":
            {"$cond": [
                {"$eq": ["$_id.tracked_item_type", "Software" ]},
                { "_id": "$_id", "duration": "$duration" },
                null
            ]}
        }
    }},

    // Remove the null values for "software"
    {"$unwind": "$software"},
    {"$match": { "software": {"$ne": null} }},
    {"$group": {
        "_id": "$_id",
        "software": {"$push": "$software"},
        "sites": {"$first": "$sites"}
    }},

    // Remove the null values for "sites"
    {"$unwind": "$sites"},
    {"$match": { "sites": {"$ne": null} }},
    {"$group": {
        "_id": "$_id",
        "software": {"$first": "$software"},
        "sites": {"$push": "$sites"}
    }},

    // Project out software and limit to the *top* 2 results
    {"$unwind": "$software"},
    {"$project": {
        // "_id" becomes the software record itself. The original literal also
        // listed `"_id": 0` before this entry; a repeated key is overridden by
        // the later one, so that entry had no effect and is dropped.
        "_id": { "_id": "$software._id", "duration": "$software.duration" },
        "sites": "$sites"
    }},
    {"$limit" : 2},

    // Project sites, grouping multiple software per key, requires a sort
    // then limit the *top* 2 results
    {"$unwind": "$sites"},
    {"$group": {
        "_id": { "_id": "$sites._id", "duration": "$sites.duration" },
        "software": {"$push": "$_id" }
    }},
    {"$sort": { "_id.duration": -1 }},
    {"$limit": 2},

    // So this part just "normalizes" a little so we get one record that
    // essentially has two arrays in it
    {"$group": {
        "_id": { "_id": null, "software": "$software" },
        "sites": {"$push": "$_id" }
    }},

    // Then we just project with a new field, and the "$setUnion" of the two arrays
    {"$project": {
        "_id": 0,
        "records": {"$setUnion": ["$_id.software", "$sites"]}
    }},

    // Unwind the array to documents
    {"$unwind": "$records"},

    // Shape the final output
    {"$project": {
        "tracked_item_type": "$records._id.tracked_item_type",
        "tracked_item_name": "$records._id.tracked_item_name",
        "duration": "$records.duration"
    }},

    // Final sort on the result
    {"$sort": { "tracked_item_type": 1, "duration": -1 }}

])

显然存在一个临界点（drop-off），超过之后整个方法就变得不切实际：它的基本前提是能够把所有文档 $push 到各自的数组中，以便从中取出排在最前面的结果，而最终还需要对这些结果使用 $limit。

因此，当每个“类别”的结果数量很大时，更实用的做法可能是分别处理每个“类别”，然后只把各自的结果限制为所需的前两项。

但作为练习，至少我现在知道可以完成。希望这对某人有用。

我仍然有兴趣看看是否有人有其他方法。

MongoDB文档重塑

样本文件

期望结果

聚合解决方案

“还没有”输出

2 个答案:

首先得到我自己的答案！