MongoDB 数据分组——该用 mapReduce 还是聚合?

时间:2015-12-03 12:08:42

标签: mongodb mapreduce mongodb-query aggregation-framework

我有这样的文档:

{
    "_id" : ObjectId("566041212729b51edb5871d4"),
    "userEmail" : "abc@example.com",
    "subscription" : {
        "project1" : {
            "subscribed" : false
        },
        "project2" : {
            "subscribed" : true
        }
    }
}

{
    "_id" : ObjectId("565e906bc2209d91c4357b59"),
    "userEmail" : "mno@example.com",
    "subscription" : {
        "project1" : {
            "subscribed" : true
        },
        "project2" : {
            "subscribed" : true
        },
        "project3" : {
            "subscribed" : true
        }
    }
}

我希望按用户分组,列出每个用户 subscribed 标记为 true 的项目。

例如,我期待的是:

abc@example.com - project2
mno@example.com - project1,project2,project3

我会有一个 cron 作业,把已订阅项目的相应详细信息发送到对应的邮箱地址。

我尝试了聚合,但聚合需要指定键名。在我的情况下,键(project1、project2 等)是动态的。我在某处读到 mapReduce 才是正确的选择,但我没有使用 mapReduce 的经验。

请帮我解决这个问题,并多解释一些,让我理解该如何处理它。

1 个答案:

答案 0 :(得分:1)

根据您文档的当前结构,您需要使用 mapReduce:

// Group the names of the subscribed projects by user e-mail.
db.subscription.mapReduce(
    // map: invoked with `this` bound to each document. Emits the document's
    // userEmail together with an array of the keys of `subscription` whose
    // `subscribed` flag is truthy.
    function() {
        var projects = [];
        var subscription = this.subscription;
        // `var key` keeps the loop variable local; a bare `key` would be
        // created as an implicit global.
        for (var key in subscription) {
            if (Object.prototype.hasOwnProperty.call(subscription, key) && subscription[key]['subscribed']) {
                projects.push(key);
            }
        }
        emit(this.userEmail, projects);
    },
    // reduce: deliberately empty. The sample run shown after this call
    // reports "reduce" : 0, i.e. it was never invoked for that data.
    // NOTE(review): if several documents can share a userEmail, this reducer
    // would return undefined for that key -- confirm e-mails are unique.
    function(key, values) {},
    // Return the results inline instead of writing them to a collection.
    { out: { 'inline': 1 } }
)

返回:

{
        "results" : [
                {
                        "_id" : "abc@example.com",
                        "value" : [
                                "project2"
                        ]
                },
                {
                        "_id" : "mno@example.com",
                        "value" : [
                                "project1",
                                "project2",
                                "project3"
                        ]
                }
        ],
        "timeMillis" : 28,
        "counts" : {
                "input" : 2,
                "emit" : 2,
                "reduce" : 0,
                "output" : 2
        },
        "ok" : 1
}

您应该考虑更改文档结构:把 "subscription" 改成子文档数组。为了获得最高效率,您需要使用 "bulk"(批量)操作来更新这些文档。

// One-off migration: rewrite each document's `subscription` sub-document
// (keyed by project name) as an array of { project, subscribed } entries.
// Updates are queued on a bulk builder and sent to the server 500 at a time.
var bulk = db.subscription.initializeOrderedBulkOp();
var count = 0;

db.subscription.find().forEach(function(doc) {
    var subscription = doc.subscription;
    // Skip documents that already hold an array: iterating an array with
    // for...in would yield its indexes and rewrite them as project names.
    if (Array.isArray(subscription)) {
        return;
    }
    var newSubscriptions = [];
    // `var key` keeps the loop variable local instead of an implicit global.
    for (var key in subscription) {
        if (Object.prototype.hasOwnProperty.call(subscription, key)) {
            newSubscriptions.push({ 'project': key, 'subscribed': subscription[key]['subscribed'] });
        }
    }
    bulk.find({ '_id': doc._id }).updateOne({
        '$set': { 'subscription': newSubscriptions }
    });
    count++;
    if (count % 500 === 0) {
        bulk.execute();
        // A bulk builder cannot be reused once executed; start a new one.
        bulk = db.subscription.initializeOrderedBulkOp();
    }
});

// Flush the last partial batch. When count is an exact multiple of 500 (or 0)
// nothing is queued, and executing an empty bulk builder raises an error.
if (count % 500 !== 0) {
    bulk.execute();
}

完成此操作后,您的文档如下所示:

{
        "_id" : ObjectId("566041212729b51edb5871d4"),
        "userEmail" : "abc@example.com",
        "subscription" : [
                {
                        "project" : "project1",
                        "subscribed" : false
                },
                {
                        "project" : "project2",
                        "subscribed" : true
                }
        ]
}
{
        "_id" : ObjectId("565e906bc2209d91c4357b59"),
        "userEmail" : "mno@example.com",
        "subscription" : [
                {
                        "project" : "project1",
                        "subscribed" : true
                },
                {
                        "project" : "project2",
                        "subscribed" : true
                },
                {
                        "project" : "project3",
                        "subscribed" : true
                }
        ]
}

这样您就可以使用 .aggregate() 方法,通过聚合管道来得到结果:

// For every user document, project `userEmail` and a `projects` array that
// holds the names of the `subscription` entries whose `subscribed` is truthy.
db.subscription.aggregate([
    {
        $project: {
            userEmail: 1,
            projects: {
                // $map replaces each entry with its project name, or with
                // `false` when it is not subscribed; the set difference
                // against [false] then drops those placeholders.
                $setDifference: [
                    {
                        $map: {
                            input: '$subscription',
                            as: 'srpt',
                            in: {
                                $cond: ['$$srpt.subscribed', '$$srpt.project', false]
                            }
                        }
                    },
                    [false]
                ]
            }
        }
    }
])

其输出为:

{
        "_id" : ObjectId("566041212729b51edb5871d4"),
        "userEmail" : "abc@example.com",
        "projects" : [
                "project2"
        ]
}
{
        "_id" : ObjectId("565e906bc2209d91c4357b59"),
        "userEmail" : "mno@example.com",
        "projects" : [
                "project1",
                "project2",
                "project3"
        ]
}