Question

我有一个包含大约168,200,000个文档的MongoDB集合。我试图用$group获取某个字段的平均值，并且在管道中的$group之前使用了$match，以便利用client.city上的索引。但查询运行大约需要5分钟，这非常慢。

以下是我尝试过的事情：

db.ar12.aggregate(
    {$match:{'client.city':'New York'}},
    {'$group':{'_id':'client.city', 'avg':{'$avg':'$length'}}}
)

db.ar12.aggregate(
    {$match:{'client.city':'New York'}},
    {'$group':{'_id':null, 'avg':{'$avg':'$length'}}}
)

db.ar12.aggregate(
    {$match:{'client.city':'New York'}}, 
    {$project: {'length':1}},
    {'$group':{'_id':null, 'avg':{'$avg':'$length'}}}
)

这3个查询的耗时大致相同。client.city等于'New York'的文档数量为1,231,672，而find({'client.city':'New York'}).count()只需要一秒钟就能运行完。

> db.version()
  3.2.0

编辑

以下是explain的结果……关于评论中建议添加包含length的复合索引：这会有帮助吗？毕竟我并不是按length进行搜索，而是需要所有的length值……

{
"waitedMS" : NumberLong(0),
"stages" : [
    {
        "$cursor" : {
            "query" : {
                "client.city" : "New York"
            },
            "fields" : {
                "length" : 1,
                "_id" : 1
            },
            "queryPlanner" : {
                "plannerVersion" : 1,
                "namespace" : "clients.ar12",
                "indexFilterSet" : false,
                "parsedQuery" : {
                    "client.city" : {
                        "$eq" : "New York"
                    }
                },
                "winningPlan" : {
                    "stage" : "FETCH",
                    "inputStage" : {
                        "stage" : "IXSCAN",
                        "keyPattern" : {
                            "client.city" : 1
                        },
                        "indexName" : "client.city_1",
                        "isMultiKey" : false,
                        "isUnique" : false,
                        "isSparse" : false,
                        "isPartial" : false,
                        "indexVersion" : 1,
                        "direction" : "forward",
                        "indexBounds" : {
                            "client.city" : [
                                "[\"New York\", \"New York\"]"
                            ]
                        }
                    }
                },
                "rejectedPlans" : [ ]
            }
        }
    },
    {
        "$project" : {
            "length" : true
        }
    },
    {
        "$group" : {
            "_id" : {
                "$const" : null
            },
            "total" : {
                "$avg" : "$length"
            }
        }
    }
],
"ok" : 1
}

编辑2

我已添加了client.city和length的复合索引，但没有效果，速度仍然太慢。我尝试了以下两个查询：

db.ar12.aggregate(
    {$match: {'client.city':'New York'}}, 
    {$project: {'client.city':1, 'length':1}},
    {'$group':{'_id':'$client.city', 'avg':{'$avg':'$length'}}}
)

上面的查询并没有使用复合索引，所以我尝试了下面这个查询来强制使用它，但仍然没有任何改变：

db.ar12.aggregate(
    {$match: { $and : [{'client.city':'New York'}, {'length':{'$gt':0}}]}}, 
    {$project: {'client.city':1, 'length':1}},
    {'$group':{'_id':'$client.city', 'avg':{'$avg':'$length'}}}
)

下面是最后一个查询的explain结果：

{
"waitedMS" : NumberLong(0),
"stages" : [
    {
        "$cursor" : {
            "query" : {
                "$and" : [
                    {
                        "client.city" : "New York"
                    },
                    {
                        "length" : {
                            "$gt" : 0
                        }
                    }
                ]
            },
            "fields" : {
                "client.city" : 1,
                "length" : 1,
                "_id" : 1
            },
            "queryPlanner" : {
                "plannerVersion" : 1,
                "namespace" : "clients.ar12",
                "indexFilterSet" : false,
                "parsedQuery" : {
                    "$and" : [
                        {
                            "client.city" : {
                                "$eq" : "New York"
                            }
                        },
                        {
                            "length" : {
                                "$gt" : 0
                            }
                        }
                    ]
                },
                "winningPlan" : {
                    "stage" : "CACHED_PLAN",
                    "inputStage" : {
                        "stage" : "FETCH",
                        "inputStage" : {
                            "stage" : "IXSCAN",
                            "keyPattern" : {
                                "client.city" : 1,
                                "length" : 1
                            },
                            "indexName" : "client.city_1_length_1",
                            "isMultiKey" : false,
                            "isUnique" : false,
                            "isSparse" : false,
                            "isPartial" : false,
                            "indexVersion" : 1,
                            "direction" : "forward",
                            "indexBounds" : {
                                "client.city" : [
                                    "[\"New York\", \"New York\"]"
                                ],
                                "length" : [
                                    "(0.0, inf.0]"
                                ]
                            }
                        }
                    }
                },
                "rejectedPlans" : [ ]
            }
        }
    },
    {
        "$project" : {
            "client" : {
                "city" : true
            },
            "length" : true
        }
    },
    {
        "$group" : {
            "_id" : "$client.city",
            "avg" : {
                "$avg" : "$length"
            }
        }
    }
],
"ok" : 1
}

Answer 1

我找到了一个变通办法：length的取值范围是1到70。所以我在Python中从1到70进行迭代，并查询每个城市中每个length值的文档数量，

db.ar12.find({'client.city':'New York', 'length':i}).count()

这是非常快的，然后在python中计算平均值，运行大约需要2秒。

这不是最好的解决方案，因为我还有其他查询要运行，我不知道是否能为所有这些查询都找到变通办法……

Mongo $group太慢

1 个答案: