我为移动应用分析编写了一个服务器,其中我有一个分片(!Upd )集合,其中的事件如下:
{
"event": "install",
"userId": "a",
"time": 2014-02-09,
"data" : ...
},
{
"event": "login",
"userId": "a",
"time": 2014-02-12,
"data" : ...
},
{
"event": "install",
"userId": "b",
"time": 2014-4-29,
"data" : ...
},
{
"event": "login",
"userId": "b",
"time": 2014-4-30,
"data" : ...
}
...
我需要选出在 install 事件后的第二天没有 login 事件的用户(换句话说,我想选出安装了应用、但第二天没有登录的用户)。因此,上述数据的输出应为:
{
"userId": "a",
"data" : ...
}
如何使用聚合框架或mapreduce执行此任务?或者可能是另一种解决方案?
答案 0 :(得分:2)
这有点棘手 :-) 如果 time 只是一个日期字段(不含时分秒数据),就可以用聚合来完成。假设集合如下:
{
"_id" : ObjectId("57694365ef9176ec54960a66"),
"event" : "install",
"userId" : "a",
"time" : ISODate("2014-09-02T00:00:00.000Z")
},{
"_id" : ObjectId("57694365ef9176ec54960a67"),
"event" : "login",
"userId" : "a",
"time" : ISODate("2014-12-02T00:00:00.000Z")
},{
"_id" : ObjectId("57694365ef9176ec54960a68"),
"event" : "install",
"userId" : "b",
"time" : ISODate("2014-04-29T00:00:00.000Z")
},{
"_id" : ObjectId("57694365ef9176ec54960a69"),
"event" : "login",
"userId" : "b",
"time" : ISODate("2014-04-30T00:00:00.000Z")
}
我们可以使用聚合查询:
// Stage 1: keep only the "install" events; the later stages start from
// each install document and look for activity on the following day.
var match = { $match: { event: "install" } };
// Stage 2: pass the original fields through and add `nextDay`, which is
// `time` shifted forward by 86,400,000 ms (24 h * 60 min * 60 s * 1000 ms).
var projectNextDayDate = {
    $project: {
        _id: 1,
        event: 1,
        userId: 1,
        time: 1,
        nextDay: { $add: ["$time", 86400000] }
    }
};
// Stage 3: join against the "zella" collection, attaching as `mergedDocs`
// every document whose `time` equals this document's `nextDay`.
var lookup = {
    $lookup: {
        from: "zella",
        localField: "nextDay",
        foreignField: "time",
        as: "mergedDocs"
    }
};
// Stage 4: the $lookup joins on the date only, so `mergedDocs` may contain
// next-day events of other users. Narrow it down to "login" events that
// belong to the same user as the install document.
var nowMatchUsers = {
    $project : {
        _id : 1,
        event : 1,
        userId : 1,
        time : 1,
        nextDay : 1,
        mergedDocs : {
            $filter : {
                // Must be the field path "$mergedDocs"; a bare "mergedDocs"
                // is a string literal, which $filter rejects as non-array input.
                input : "$mergedDocs",
                as : "m",
                cond : {
                    $and : [
                        { $eq : ["$$m.userId", "$userId"] },
                        { $eq : ["$$m.event", "login"] }
                    ]
                }
            }
        }
    }
};

// Stage 5: an empty `mergedDocs` means the user has no login on the day
// after the install, which is exactly the set of users we are looking for.
var findEmptyArrays = {
    $match : {
        mergedDocs : []
    }
};

// The per-user filter has to run between the lookup and the empty-array
// match; without it, any user's event on the next day hides the result.
db.zella.aggregate([match, projectNextDayDate, lookup, nowMatchUsers, findEmptyArrays]);
输出如下:
{
"_id" : ObjectId("57694365ef9176ec54960a66"),
"event" : "install",
"userId" : "a",
"time" : ISODate("2014-09-02T00:00:00.000Z"),
"nextDay" : ISODate("2014-09-03T00:00:00.000Z"),
"mergedDocs" : []
}
这里假设 time 是不含时分秒的日期,例如 2014-09-02T00:00:00.000。
合并集合的另一种方法是以 userId 作为 $lookup 的关联字段,但这样需要更多逻辑来过滤结果集,并且可能降低性能。
答案 1 :(得分:1)
您可以尝试运行以下聚合管道:
// Selects users who installed the app but have no login event exactly one
// day (24 h) after the install. Output documents: { userId, data, isChurn }.
db.test.aggregate([
    // Drop unrelated events before grouping so less data flows through
    // the pipeline.
    { "$match": { "event": { "$in": ["install", "login"] } } },
    {
        "$group": {
            "_id": "$userId",
            // Install time + 24 h. Non-install events contribute null, which
            // $min ignores; the result is null only if the user has no
            // install event at all.
            "dayAfterInstall": {
                "$min": {
                    "$cond": [
                        { "$eq": [ "$event", "install" ] },
                        { "$add": [ "$time", 24 * 60 * 60 * 1000 ] },
                        null
                    ]
                }
            },
            // Every distinct login time of the user (install events add a
            // harmless null entry).
            "loginTimes": {
                "$addToSet": {
                    "$cond": [ { "$eq": [ "$event", "login" ] }, "$time", null ]
                }
            },
            "data": { "$push": "$data" }
        }
    },
    // Users without an install event are outside the question's scope.
    { "$match": { "dayAfterInstall": { "$ne": null } } },
    {
        "$project": {
            // Churned = none of the user's logins falls on the day after the
            // install. All logins are checked, so a user who logs in several
            // times is not misclassified by whichever login happens to be
            // looked at.
            "isChurn": {
                "$not": [
                    {
                        "$anyElementTrue": [
                            {
                                "$map": {
                                    "input": "$loginTimes",
                                    "as": "t",
                                    "in": { "$eq": [ "$$t", "$dayAfterInstall" ] }
                                }
                            }
                        ]
                    }
                ]
            },
            "userId": "$_id", "data": 1, "_id": 0
        }
    },
    { "$match" : { "isChurn" : true } }
]);
答案 2 :(得分:0)
这是另一个使用mapreduce和聚合的解决方案:
/**
 * Map step: emits one entry per install/login event, keyed by
 * { userId, nextDayAfterInstall }.
 *
 * - install events are keyed by the day AFTER the install (time + 24 h);
 * - login events are keyed by their own time.
 *
 * An install and a next-day login of the same user therefore share a key,
 * while an install without a next-day login keeps a key of its own.
 * Events of any other type are ignored.
 *
 * Runs with `this` bound to the current document and relies on the global
 * `emit` provided by MongoDB's map-reduce runtime. Kept in ES5 syntax
 * because it is executed by the server-side JavaScript engine.
 */
var mapFunction = function() {
    if (this.event !== 'install' && this.event !== 'login') {
        return;
    }

    // The event documents store their timestamp in `time`; `date` is kept
    // as a fallback for collections that use that field name instead.
    var when = this.time || this.date;
    var value = { data: this.data, count: 1 };

    if (this.event === 'install') {
        var nextDay = new Date(when.getTime() + 24 * 60 * 60 * 1000);
        emit({ userId: this.userId, nextDayAfterInstall: nextDay }, value);
    } else {
        // Only 'login' can reach this branch because of the guard above.
        emit({ userId: this.userId, nextDayAfterInstall: when }, value);
    }
};
/**
 * Reduce step: merges all values emitted for one key.
 *
 * @param {Object} event  The emitted key ({ userId, nextDayAfterInstall });
 *                        not used by the computation.
 * @param {Array}  values Objects of the form { data, count }.
 * @returns {Object} { data, count } where `count` is the sum of the input
 *                   counts and `data` is the data of the last value.
 *
 * The accumulator starts at 0 so that the result is a true sum. MongoDB may
 * feed the output of one reduce call back into another, and a non-zero
 * starting value would inflate the count on every pass.
 */
var reduceFunction = function(event, values) {
    var value = { data: null, count: 0 };
    for (var index = 0; index < values.length; ++index) {
        value.count += values[index].count;
        value.data = values[index].data;
    }
    return value;
};
// Run the map-reduce over the `events` collection and store the result in
// the `case1_mr_out` collection for the follow-up aggregation.
db.events.mapReduce(mapFunction, reduceFunction, { out: "case1_mr_out" });
// Collapse the map-reduce output to one document per user: keep the last
// `value.data` seen and the largest `value.count` across that user's keys.
var groupByUserId = {
    $group: {
        _id: { userId: "$_id.userId" },
        data: { $last: "$value.data" },
        count: { $max: "$value.count" }
    }
};
// Keep only the users whose largest per-key count is exactly 1, i.e. none
// of their keys received more than one emitted value.
var filterWhereOnlyOne = { $match: { count: 1 } };
// Group the map-reduce output per user, then keep the users whose count is 1.
db.case1_mr_out.aggregate([
    groupByUserId,
    filterWhereOnlyOne
]);