最初,我有一个关系,订单有很多lineitems,许多lineitems只有一个订单,像往常一样。
使用mongoDB,我做了这个文档来代表它:
{
"_id" : ObjectId("511b7d1b3daee1b1446ecdfe"),
"l_order" : {
"_id" : ObjectId("511b7d133daee1b1446eb54d"),
"o_orderkey" : NumberLong(1),
"o_totalprice" : 173665.47,
"o_orderdate" : ISODate("1996-01-02T03:00:00Z"),
"o_orderpriority" : "5-LOW",
"o_shippriority" : 0,
},
"l_linenumber" : 1,
"l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
"l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
"l_receiptdate" : ISODate("1996-03-22T03:00:00Z"),
}
我的意图是翻译这个SQL查询:
select
o_orderpriority,
count(*) as order_count
from
orders
where
o_orderdate >= date '1993-07-01'
and o_orderdate < date '1993-07-01' + interval '3' month
and exists (
select
*
from
lineitem
where
l_orderkey = o_orderkey
and l_commitdate < l_receiptdate
)
group by
o_orderpriority
order by
o_orderpriority;
为此,请使用两个mapreduce函数:
第一的
db.runCommand({
mapreduce: "lineitem",
query: {
"l_order.o_orderdate": {'$gte': new Date("July 01, 1993"), '$lt': new Date("Oct 01, 1993")}
},
map: function Map() {
if(this.l_commitdate < this.l_receiptdate){
emit( this.l_order.o_orderkey, this.l_order.o_orderpriority );
}
},
out: 'query004a'
});
第二
db.runCommand({
mapreduce: "query004a",
map: function Map() {
/*Remenbering, the value here will be this.l_order.o_orderpriority from the previous mapreduce function*/
emit( this.value, 1 );
},
reduce: function(key, values) {
return Array.sum(values);
},
out: 'query004b'
});
首先,我将文件分隔在日期范围内并尊重比较,将它们分组以避免重复。在第二个我将o_orderpriority和sum分组。
令我惊讶的是,答案比我预期的要大。但是为什么会发生这种情况?
答案 0 :(得分:0)
在你的第一个map函数中,你应该使用'oderpriority'作为键,'orderkey'作为值 - 这会将你的设置减少到你想要的第二个mapReduce中。 (您需要指定reduce函数,否则mapReduce将返回错误)。
所以,这可能是这样的:
OrderDateMin = new Date("1996-01-01");
OrderDateMax = new Date("1996-04-01");
// first where on oderdate
query = {
"l_order.o_orderdate": {$gte: OrderDateMin, $lt: OrderDateMax}
}
map1 = function() {
//second "where" on commitdate < receiptdate
if ( this.l_commitdate < this.l_receiptdate ) {
// emit orderpriority as key, "1" as counter
emit( this.l_order.o_orderpriority, this.l_order.o_orderkey );
}
};
reduce1 = function(key, values) {
return 1;
}
db.runCommand({
mapReduce: "xx",
query: query,
map: map1,
reduce: reduce1,
out: 'query004a',
})
map2 = function() {
//_id is ordepriority
emit( this._id, 1 );
};
reduce2 = function(key, values) {
// count entries per orderpriority
count = 0;
values.forEach( function(value) { count += value; } );
return count;
}
db.runCommand({
mapReduce: "query004a",
map: map2,
reduce: reduce2,
out: 'query004b',
})
现在,使用一个聚合命令可以实现同样的目标,这个命令更快(在C中实现,而不是在JavaScript中实现):
db.xx.aggregate([
// first "where", this will use an index, if defined
{ $match: {
"l_order.o_orderdate": { $gte: OrderDateMin, $lt: OrderDateMax }
}},
// reduce to needed fields, create a field for decision of second "where"
{ $project: {
"key": "$l_order.o_orderkey",
"pri": "$l_order.o_orderpriority",
okay: { $cond: [ {$lt: ["l_commitdate", "l_receiptdate"]}, 1, 0 ] }
}},
// select second where condition matched
{ $match: { "okay": 1 } },
// group by priority and key
{ $group: { _id: { "pri": "$pri", "key": "$key" } } },
// group by priority - count entries
{ $group: { _id: "$_id.pri", "count": { $sum: 1 } } },
])
会返回类似的内容:
{ "result" : [ { "_id" : "5-LOW", "count" : 1 } ], "ok" : 1 }
最后但并非最不重要:有关设计的建议:
如果您的结构是相反的方式会更简单:“订单”集合,其中订单项嵌入为项目数组。这样可以避免在整个集合中出现重复的订单数据。
更多信息:
http://docs.mongodb.org/manual/reference/command/mapReduce/#mapReduce
http://docs.mongodb.org/manual/reference/aggregation
这有帮助吗?
干杯
罗纳德