我从一组看起来像这样的文档开始:
{
state: 'CA',
year: 2014,
accepted: true
}
{
state: 'AL',
year: 2012,
accepted: false
}
{
state: 'CA',
year: 2013,
accepted: false
}
...
我希望以这种格式结束新的聚合集合:
{
_id: 'CA',
value: {
submittedApplications2012: 34,
submittedApplications2013: 23,
submittedApplications2014: 72,
acceptedApplications2012: 12,
acceptedApplications2013: 7,
acceptedApplications2014: 5
}
}
{
_id: 'AL',
value: {
submittedApplications2012: 73,
submittedApplications2013: 67,
submittedApplications2014: 98,
acceptedApplications2012: 45,
acceptedApplications2013: 34,
acceptedApplications2014: 31
}
}
我编写了一个 mapReduce,它按州名(state)对文档进行分组,并遍历每个州的文档,递增相应的属性:
/**
 * Map step: reads state, year and accepted from the current document
 * (`this`) and emits the { year, accepted } pair keyed by the state.
 */
var map = function() {
  emit(this.state, { year: this.year, accepted: this.accepted });
};
/**
 * Reduce step: tallies submitted and accepted applications per year
 * (2012-2014) for one key.
 *
 * The result of one reduce call can be passed back in as one of the
 * `values` of a later call for the same key. Each entry is therefore
 * handled in one of two forms:
 *   - a mapped value { year, accepted }, which counts as one application;
 *   - a partial result (an object shaped like this function's return
 *     value), whose counters are added to the running totals.
 * Treating partial results as "unknown year" would silently drop every
 * count accumulated by the earlier call.
 *
 * @param {*} key - key being reduced; not used in the computation
 * @param {Array<Object>} values - mapped values and/or earlier results
 * @returns {Object} the six submitted/accepted counters
 */
var reduce = function(key, values) {
  var totals = {
    submittedApplications2012: 0,
    submittedApplications2013: 0,
    submittedApplications2014: 0,
    acceptedApplications2012: 0,
    acceptedApplications2013: 0,
    acceptedApplications2014: 0
  };
  var counterNames = Object.keys(totals);

  values.forEach(function(v) {
    if (v.year === undefined) {
      // No year: treat as a partial result and merge its counters.
      // Missing counters contribute 0.
      counterNames.forEach(function(name) {
        totals[name] += v[name] || 0;
      });
      return;
    }

    switch (v.year) {
      case 2012:
      case 2013:
      case 2014:
        totals['submittedApplications' + v.year]++;
        if (v.accepted) {
          totals['acceptedApplications' + v.year]++;
        }
        break;
      default:
        // Years outside 2012-2014 are not reported.
    }
  });

  return totals;
};
// Run map/reduce over test_collection with the `out: { inline: 1 }` option.
db.test_collection.mapReduce(map, reduce, { out: { inline: 1 } });
不幸的是,结果不准确。对于 submitted2012、submitted2013 和 submitted2014,阿拉巴马州最终只得到 9、8 和 3,其他州的数字也同样偏低。集合中有 10,000 条记录,这些数字应该高得多。
我认为出现这种情况是因为 reduce 函数会被多次调用(参见 “Reduce is called several times with the same key in mongodb map-reduce”),而 reducedObject 对象在后续调用中被覆盖。
如何防止这种情况,以便准确计算提交和接受的申请数量?
以下是一些以原始格式创建测试集合的代码:
// Rebuild test_collection from scratch with 10,000 random demo documents.
var states = ['AL', 'CA', 'FL', 'TN', 'OH'];
var years = [2012, 2013, 2014];
var i = 10000;

// Returns a randomly chosen element of `list`.
function pickRandom(list) {
  return list[Math.floor(Math.random() * list.length)];
}

db.test_collection.drop();

while (i--) {
  db.test_collection.insert({
    state: pickRandom(states),
    year: NumberInt(pickRandom(years)),
    accepted: Math.random() >= 0.5
  });
}
答案 0 :(得分:1)
我真的不认为 mapReduce 是这里的正确选择。就我个人而言,我会使用聚合框架,它在这里的处理速度会更快,因为所有操作都是本机代码,不需要对代码或对象进行 JavaScript 转换。
这样做只需要一个简单的 $group 操作,并借助 $cond 对 true/false 值做一些处理,将其转换为数字:
db.test_collection.aggregate([
{ "$group": {
"_id": {
"state": "$state",
"year": "$year"
},
"submitted": { "$sum": 1 },
"accepted": {
"$sum": {
"$cond": [
"$accepted",
1,
0
]
}
}
}},
{ "$group": {
"_id": "$_id.state",
"values": {
"$push": {
"year": "$_id.year",
"submitted": "$submitted",
"accepted": "$accepted"
}
}
}}
])
值转换为数字:
{
"_id" : "CA",
"values" : [
{
"year" : 2014,
"submitted" : 691,
"accepted" : 360
},
{
"year" : 2013,
"submitted" : 653,
"accepted" : 332
},
{
"year" : 2012,
"submitted" : 681,
"accepted" : 350
}
]
}
产生这样的输出(为简洁起见,只有一个状态):
// Builds a $sum accumulator that adds 1 for each document where
// `condition` holds and 0 otherwise.
function countWhere(condition) {
  return { "$sum": { "$cond": [condition, 1, 0] } };
}

// Condition: the document's year field equals `year`.
function inYear(year) {
  return { "$eq": ["$year", year] };
}

// Condition: the year matches and the accepted field is truthy.
function acceptedInYear(year) {
  return { "$and": [inYear(year), "$accepted"] };
}

// Single $group per state with one submitted/accepted counter pair per year.
db.test_collection.aggregate([
  { "$group": {
    "_id": "$state",
    "submitted2012": countWhere(inYear(2012)),
    "accepted2012": countWhere(acceptedInYear(2012)),
    "submitted2013": countWhere(inYear(2013)),
    "accepted2013": countWhere(acceptedInYear(2013)),
    "submitted2014": countWhere(inYear(2014)),
    "accepted2014": countWhere(acceptedInYear(2014))
  }}
])
或者,如果您确实必须在输出中指定所有键,请使用以下形式。在代码中生成这种结构是一件很简单的事情,因为聚合管道(实际上任何原生 MongoDB 查询)本质上都只是“数据结构”:
// Generates the $group stage: groups by state and adds a
// "submitted<year>" / "accepted<year>" counter pair for each year.
var groupStage = {
  "$group": [2012, 2013, 2014].reduce(function(fields, year) {
    // Fresh condition object on each call so no sub-document is shared.
    function yearMatches() {
      return { "$eq": ["$year", year] };
    }

    fields["submitted" + year] = {
      "$sum": { "$cond": [yearMatches(), 1, 0] }
    };
    fields["accepted" + year] = {
      "$sum": {
        "$cond": [{ "$and": [yearMatches(), "$accepted"] }, 1, 0]
      }
    };
    return fields;
  }, { "_id": "$state" })
};
// Run a pipeline whose only stage is the object held in groupStage.
db.test_collection.aggregate([groupStage])
事实上,它实际上就像这样微不足道:
{
"_id" : "CA",
"submitted2012" : 681,
"accepted2012" : 350,
"submitted2013" : 653,
"accepted2013" : 332,
"submitted2014" : 691,
"accepted2014" : 360
}
它的输出:
db.test_collection.mapReduce(
  // Map: emit, keyed by state, an object holding "submitted<year>" = 1 and
  // "accepted<year>" = 1 or 0 for the current document. This is the same
  // shape the reducer returns.
  function() {
    var counts = {};
    counts["submitted" + this.year] = 1;
    counts["accepted" + this.year] = this.accepted ? 1 : 0;
    emit(this.state, counts);
  },
  // Reduce: sum every counter found in the incoming values, field by field.
  // Because input and output share one shape, a returned object can be
  // passed back in as a value.
  function(key, values) {
    var totals = {};
    values.forEach(function(value) {
      Object.keys(value).forEach(function(field) {
        if (!totals.hasOwnProperty(field)) {
          totals[field] = 0;
        }
        totals[field] += value[field];
      });
    });
    return totals;
  },
  { "out": { "inline": 1 } }
)
使用 mapReduce 执行此操作要慢得多,但您需要考虑的要点是:“mapper”发出的值必须与 reducer 本身返回的输出具有相同的结构。这是因为“reducer”实际上并不是一次处理完所有分组文档,而是其“已归约”的输出可以反过来作为“输入”,与其他已发出的或“已归约”的值一起被进一步归约:
{
"_id" : "CA",
"value" : {
"submitted2014" : 691,
"accepted2014" : 360,
"submitted2013" : 653,
"accepted2013" : 332,
"submitted2012" : 681,
"accepted2012" : 350
}
}
有了这种输出:
db.test_collection.mapReduce(
  // Map: emit, keyed by state, a { values: [...] } wrapper holding one
  // { year, submitted, accepted } entry for the current document. The
  // reducer returns the same wrapper shape.
  function() {
    var entry = {
      "year": this.year,
      "submitted": 1,
      "accepted": this.accepted ? 1 : 0
    };
    emit(this.state, { "values": [entry] });
  },
  // Reduce: merge all entries by year, then return one entry per year.
  function(key, values) {
    var byYear = {};

    values.forEach(function(value) {
      value.values.forEach(function(data) {
        if (!byYear.hasOwnProperty(data.year)) {
          byYear[data.year] = { submitted: 0, accepted: 0 };
        }
        byYear[data.year].submitted += data.submitted;
        byYear[data.year].accepted += data.accepted;
      });
    });

    var result = {
      "values": Object.keys(byYear).map(function(year) {
        return {
          // Object keys are strings; convert back to a base-10 number.
          "year": parseInt(year, 10),
          "submitted": byYear[year].submitted,
          "accepted": byYear[year].accepted
        };
      })
    };

    // Newest year first. The comparator must return a negative, zero or
    // positive number; a boolean result gives engine-dependent ordering.
    result.values.sort(function(a, b) {
      return b.year - a.year;
    });

    return result;
  },
  { "out": { "inline": 1 } }
)
对于记录,可以像这样获得类似原始聚合示例的输出:
{
"_id" : "CA",
"value" : {
"values" : [
{
"year" : 2014,
"submitted" : 691,
"accepted" : 360
},
{
"year" : 2013,
"submitted" : 653,
"accepted" : 332
},
{
"year" : 2012,
"submitted" : 681,
"accepted" : 350
}
]
}
}
使用mapReduce规则后面的输出键:
// Rebuild test_collection with 10,000 random documents, queued through
// ordered bulk operations.
var i = 10000;
var states = ['AL', 'CA', 'FL', 'TN', 'OH'];
var years = [2012, 2013, 2014];
var bulk = db.test_collection.initializeOrderedBulkOp();

db.test_collection.drop();

while (i--) {
  var stateIndex = Math.floor(Math.random() * states.length);
  var yearIndex = Math.floor(Math.random() * years.length);

  bulk.insert({
    state: states[stateIndex],
    year: NumberInt(years[yearIndex]),
    accepted: Math.random() >= 0.5
  });

  // Whenever the countdown reaches a multiple of 1000, send the queued
  // inserts and start a new bulk operation.
  if (i % 1000 === 0) {
    bulk.execute();
    bulk = db.test_collection.initializeOrderedBulkOp();
  }
}
因此可以使用mapReduce,但聚合框架无疑是此类任务的更好选择。
使用Bulk操作时,您的生成脚本也可能更好一些:
/**
 * Task model; each task belongs to one Category.
 */
class Task extends Model
{
    /**
     * Relationship to the owning category.
     *
     * @return mixed the result of belongsTo('Category')
     */
    public function category() {
        return $this->belongsTo('Category');
    }

    /**
     * Query scope: keep only tasks that have a related category whose
     * `type` equals $cat_type.
     *
     * @param mixed $query    query object the constraint is added to
     * @param mixed $cat_type value compared against the category's `type`
     */
    public function scopeCategoryType($query, $cat_type) {
        $query->whereHas('category', function($q) use ($cat_type) {
            $q->where('type', $cat_type);
        }); // closes the whereHas() call; without ");" the file does not parse
    }
}
/**
 * Category model; a category has many Task records.
 */
class Category extends Model
{
/**
 * Relationship to the tasks in this category.
 * Returns the result of hasMany('Task').
 */
public function tasks() {
return $this->hasMany('Task');
}
/**
 * Query scope: adds a `type` = $cat_type condition to $query.
 * Nothing is returned; the condition is applied to $query itself.
 */
public function scopeType($query, $cat_type) {
$query->where('type', $cat_type);
}
}
答案 1 :(得分:0)
您可以在Aggregation运算符的帮助下实现:
让我们创建一个以下集合:
收藏:
// Sample documents for the `flat` collection, inserted one at a time in
// the order listed.
var seedDocs = [
  { state: "CA", year: 2014, accepted: true },
  { state: "AL", year: 2012, accepted: false },
  { state: "CA", year: 2013, accepted: false },
  { state: "AL", year: 2012, accepted: true },
  { state: "CA", year: 2011, accepted: false },
  { state: "AL", year: 2011, accepted: true },
  { state: "CA", year: 2013, accepted: false },
  { state: "AL", year: 2014, accepted: true },
  { state: "CA", year: 2014, accepted: false },
  { state: "AL", year: 2014, accepted: true },
  { state: "CA", year: 2014, accepted: false },
  { state: "AL", year: 2014, accepted: true },
  { state: "CA", year: 2014, accepted: false }
];

seedDocs.forEach(function(doc) {
  db.flat.insert(doc);
});
在查询下方将获取所需的输出:
查询:
db.flat.aggregate([
  // Stage 1: per (state, year), count all documents and those whose
  // accepted field equals true.
  { $group: {
      _id: { state: "$state", year: "$year" },
      submitted: { $sum: 1 },
      accepted: {
        $sum: {
          $cond: { if: { $eq: [ "$accepted", true ] }, then: 1, else: 0 }
        }
      }
  } },
  // Stage 2: render each (state, year) pair as one text line.
  // $substr is used to turn the numeric values into strings. A negative
  // length means "through the end of the string", so counts of 10 or more
  // are no longer cut down to their first digit.
  { $project: {
      temp: {
        $concat: [
          "submittedApplications",
          { $substr: [ "$_id.year", 0, 4 ] }, " : ",
          { $substr: [ "$submitted", 0, -1 ] }, " , ",
          "acceptedApplications",
          { $substr: [ "$_id.year", 0, 4 ] }, " : ",
          { $substr: [ "$accepted", 0, -1 ] }
        ]
      }
  } },
  // Stage 3: collect the text lines of each state into one array.
  { $group: { _id: "$_id.state", value: { $push: "$temp" } } }
]).pretty();
OutPut:
{
"_id" : "CA",
"value" : [
"submittedApplications2011 : 1 , acceptedApplications2011 : 0",
"submittedApplications2013 : 2 , acceptedApplications2013 : 0",
"submittedApplications2014 : 4 , acceptedApplications2014 : 1"
]
}
{
"_id" : "AL",
"value" : [
"submittedApplications2011 : 1 , acceptedApplications2011 : 1",
"submittedApplications2012 : 2 , acceptedApplications2012 : 1",
"submittedApplications2014 : 3 , acceptedApplications2014 : 3"
]
}