按日/秒逻辑汇总分析数据

时间:2016-06-21 13:32:33

标签: mongodb aggregation-framework

我为移动应用分析编写了一个服务器,其中有一个集合(更新:该集合是分片的),里面存储的事件如下:

{
  "event": "install",
  "userId": "a",
  "time": 2014-02-09,
  "data" : ...
},
{
  "event": "login",
  "userId": "a",
  "time": 2014-02-12,
  "data" : ...
},
{
  "event": "install",
  "userId": "b",
  "time": 2014-4-29,
  "data" : ...
},
{
  "event": "login",
  "userId": "b",
  "time": 2014-4-30,
  "data" : ...
}
...

我需要选出在install事件后的第二天没有login事件的用户(换句话说,我想选出安装了应用、但第二天没有登录的用户)。因此,上述数据的输出应为:

{
   "userId": "a",
   "data" : ...
}

如何使用聚合框架或mapreduce执行此任务?或者可能是另一种解决方案?

3 个答案:

答案 0 :(得分:2)

这有点棘手 :-) 如果 time 只是一个日期字段(不含时分秒数据),就可以用聚合来实现。假设有如下集合:

{
    "_id" : ObjectId("57694365ef9176ec54960a66"),
    "event" : "install",
    "userId" : "a",
    "time" : ISODate("2014-09-02T00:00:00.000Z")
},{
    "_id" : ObjectId("57694365ef9176ec54960a67"),
    "event" : "login",
    "userId" : "a",
    "time" : ISODate("2014-12-02T00:00:00.000Z")
},{
    "_id" : ObjectId("57694365ef9176ec54960a68"),
    "event" : "install",
    "userId" : "b",
    "time" : ISODate("2014-04-29T00:00:00.000Z")
},{
    "_id" : ObjectId("57694365ef9176ec54960a69"),
    "event" : "login",
    "userId" : "b",
    "time" : ISODate("2014-04-30T00:00:00.000Z")
}

我们可以使用聚合查询:

// Stage 1: keep only the "install" events.
var match = { $match: { event: "install" } };

// Stage 2: carry the original fields forward and add "nextDay",
// the event time shifted forward by 24 hours (expressed in milliseconds).
var projectNextDayDate = {
    $project: {
        _id: 1,
        event: 1,
        userId: 1,
        time: 1,
        nextDay: { $add: ["$time", 24 * 60 * 60 * 1000] }
    }
};

// Stage 3: self-join against the "zella" collection, attaching every document
// whose "time" equals this document's "nextDay" under the "mergedDocs" array.
var lookup = {
    $lookup: {
        from: "zella",
        localField: "nextDay",
        foreignField: "time",
        as: "mergedDocs"
    }
};

// Stage 4: narrow the joined documents down to those that belong to the same
// user as the install event, so another user's activity on that day is ignored.
var nowMatchUsers = {
    $project : {
        _id : 1,
        event : 1,
        userId : 1,
        time : 1,
        nextDay : 1,
        mergedDocs : {
            $filter : {
                // Must be the field path "$mergedDocs"; the bare string
                // "mergedDocs" is a string literal, not the joined array.
                input : "$mergedDocs",
                as : "m",
                cond : {
                    $eq : ["$$m.userId", "$userId"]
                }
            }
        }
    }
};

// Stage 5: keep only the installs whose filtered "mergedDocs" array is empty,
// i.e. no document of the same user was found on the following day.
var findEmptyArrays = {
    $match : {
        mergedDocs : []
    }
};

// nowMatchUsers has to run between the lookup and the empty-array match;
// without it the join is never restricted to the installing user.
db.zella.aggregate([match, projectNextDayDate, lookup, nowMatchUsers, findEmptyArrays]);

使用此输出:

{
    "_id" : ObjectId("57694365ef9176ec54960a66"),
    "event" : "install",
    "userId" : "a",
    "time" : ISODate("2014-09-02T00:00:00.000Z"),
    "nextDay" : ISODate("2014-09-03T00:00:00.000Z"),
    "mergedDocs" : []
}

这里假设 time 是不含时间部分的日期(例如 2014-09-02T00:00:00.000)。合并集合的另一种方法是以用户ID作为 $lookup 的关联字段,但那样需要更多的逻辑来过滤结果集,并且可能会降低性能。

答案 1 :(得分:1)

您可以尝试运行以下聚合管道:

// Pipeline that reports users for whom the "loginTime" picked per user differs
// from the "dayAfterInstall" picked per user (flagged as isChurn).
db.test.aggregate([
    // Keep the relevant fields and add "dayAfter": the event time plus 24 hours (in ms).
    {
        $project: {
            event: 1,
            userId: 1,
            time: 1,
            data: 1,
            dayAfter: { $add: ["$time", 24 * 60 * 60 * 1000] }
        }
    },

    // Only install and login events take part in the comparison.
    { $match: { event: { $in: ["install", "login"] } } },

    // Gather each user's events (with their dayAfter) and data payloads.
    {
        $group: {
            _id: "$userId",
            eventsTimeLine: {
                $push: { event: "$event", time: "$time", dayAfter: "$dayAfter" }
            },
            data: { $push: "$data" }
        }
    },

    // One document per event again, ordered by event name
    // ("install" sorts before "login").
    { $unwind: "$eventsTimeLine" },
    { $sort: { "eventsTimeLine.event": 1 } },

    // Per user: dayAfter of the first sorted event and time of the last sorted event.
    {
        $group: {
            _id: "$_id",
            dayAfterInstall: { $first: "$eventsTimeLine.dayAfter" },
            loginTime: { $last: "$eventsTimeLine.time" },
            data: { $first: "$data" }
        }
    },

    // isChurn is true when the two timestamps differ.
    {
        $project: {
            isChurn: { $ne: ["$loginTime", "$dayAfterInstall"] },
            userId: "$_id",
            data: 1,
            _id: 0
        }
    },

    { $match: { isChurn: true } }
]);

答案 2 :(得分:0)

这是另一个使用mapreduce和聚合的解决方案:

/**
 * Map step: emits one record per "install" or "login" event, keyed by the
 * user and a date, so that an install and a login on the following day end
 * up under the same key.
 *
 * - install: key date is the event time plus 24 hours.
 * - login:   key date is the event time itself.
 * - any other event: nothing is emitted.
 *
 * Runs with `this` bound to the current document and relies on the
 * map-reduce `emit(key, value)` function being in scope.
 */
var mapFunction = function() {
    var ONE_DAY_MS = 24 * 60 * 60 * 1000;
    var isInstall = this.event === 'install';
    var isLogin = this.event === 'login';

    if (!isInstall && !isLogin) {
        return;
    }

    var value = { data: this.data, count: 1 };

    // The event documents store their timestamp in the "time" field;
    // reading a non-existent "date" field would throw on getTime().
    var keyDate = isInstall
        ? new Date(this.time.getTime() + ONE_DAY_MS)
        : this.time;

    emit({ userId: this.userId, nextDayAfterInstall: keyDate }, value);
};


/**
 * Reduce step: merges all values emitted for one key.
 *
 * @param event  The key being reduced ({ userId, nextDayAfterInstall }); unused.
 * @param values Array of { data, count } objects emitted for that key
 *               (or produced by an earlier reduce pass over the same key).
 * @returns {{data: *, count: number}} The data of the last value and the sum
 *          of all counts.
 *
 * The accumulator starts at 0 so the result is a true sum. MongoDB may call
 * reduce more than once for the same key and feed earlier results back in, so
 * the function has to be associative and idempotent; starting at 1 would
 * inflate the count on every pass. Keys with a single emitted value are not
 * reduced at all and therefore keep count 1.
 */
var reduceFunction = function(event, values) {
    var merged = { data: null, count: 0 };

    values.forEach(function(item) {
        merged.count += item.count;
        merged.data = item.data;
    });

    return merged;
};

// Run the map-reduce job over the "events" collection; the result set is
// written to the "case1_mr_out" collection.
db.events.mapReduce(mapFunction, reduceFunction, { out: "case1_mr_out" });


// Collapse the map-reduce output to one document per user, keeping the last
// data payload encountered and the largest count among that user's keys.
var groupByUserId = {
    $group: {
        _id: { userId: "$_id.userId" },
        data: { $last: "$value.data" },
        count: { $max: "$value.count" }
    }
};

// Keep only the users whose largest per-key count is exactly 1.
var filterWhereOnlyOne = { $match: { "count": 1 } };

// Group the map-reduce output per user first, then apply the count filter.
db.case1_mr_out.aggregate([
    groupByUserId,
    filterWhereOnlyOne
]);