我在mongodb上运行一个MR,它在我尝试对10000条记录进行非规范化操作时默默地删除记录,它发生在集合中间的某个地方(~5K)。我尝试在记录附近删除(排序时),数据没有任何问题。
这是我的代码
mapOrd = function() {
var values = {
customerId: this.customerId,
orderNr: this.orderNr,
productId: this.productId
};
emit(this.productId, values);
};
mapPrd = function() {
var values = {
code: this.code
};
emit(this.id, values);
};
reduceOrdPrd = function(k, values) {
var result = {};
values.forEach(function(value) {
var field;
if ("orderNr" in value) {
if (!("cust_ids" in result)) {
result.cust_ids = [];
}
result.cust_ids.push(value);
} else {
for (field in value) {
if (value.hasOwnProperty(field) ) {
result[field] = value[field];
}
};
}
});
return result;
};
db.prd_ord.drop();
db.order_10000.mapReduce(mapOrd, reduceOrdPrd, {"out": {"reduce": "prd_ord"}, "sort": {"productId": 1}});
我在配置非常低的机器上运行它。 512 MB RAM,1GB SWAP内存。
但无论如何,它不应该默默地(随机地)丢弃组的元素。
对象不超过BSON对象限制。(每个键只有10到15个对象数组)。
有什么想法吗?
答案 0 :(得分:0)
考虑一下密钥的第一次减少。它将在循环中执行以下代码,
if ("orderNr" in value) {
if (!("cust_ids" in result)) {
result.cust_ids = [];
}
result.cust_ids.push(value);
}
然后获取并返回
resut = {
cust_ids : [ {
customerId : customerId1,
orderNr : orderNr1,
productId : productId1
}, {
customerId : customerId2,
orderNr : orderNr2,
productId : productId2
}, {
customerId : customerId_n,
orderNr : orderNr_n,
productId : productId_n
} ]
}
对于第二个减少on the same key
,假设有2个元素具有如上所述的结构值;肯定会用代码处理
for (field in value) {
if (value.hasOwnProperty(field)) {
result[field] = value[field];
}
}
因为它们具有相同的键" cust_ids" 所以只剩下最后一个键。之前的所有内容都将被覆盖和删除。
答案 1 :(得分:0)
感谢您的回复。
我的代码按设计工作。唯一的问题是在我的代码中的5500条记录之后,在特定点丢弃相同发射密钥的记录。例如productid = 553(发出键)有12个元素。它们在输入集合order_10000中以5502 nd记录结束。从MR的输出中减少了第5501和5502条记录。
输入Order_10000:
db.order_10000.find({productId:553}).pretty();
{
"_id" : ObjectId("53f75e2ab4e41522bccf3410"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000058)
}
{
"_id" : ObjectId("53f75e2ab4e41522bccf3411"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000059)
}
{
"_id" : ObjectId("53f75e2ab4e41522bccf3412"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000060)
}
{
"_id" : ObjectId("53f75e2ab4e41522bccf3413"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000061)
}
{
"_id" : ObjectId("53f75e2ab4e41522bccf3414"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000062)
}
{
"_id" : ObjectId("53f75e2eb4e41522bccf3415"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderLineNr" : NumberLong(1),
"orderNr" : NumberLong(6000063)
}
{
"_id" : ObjectId("53f75e2eb4e41522bccf3416"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000064)
}
{
"_id" : ObjectId("53f75e2eb4e41522bccf3417"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000065)
}
{
"_id" : ObjectId("53f75e2eb4e41522bccf3418"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000066)
}
{
"_id" : ObjectId("53f75e97b4e41522bccf3419"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000067)
}
{
"_id" : ObjectId("53f75e97b4e41522bccf341a"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000068)
}
{
"_id" : ObjectId("53f76127b4e41522bccf341e"),
"customerId" : NumberLong(5699),
"productId" : NumberLong(553),
"orderNr" : NumberLong(6000071)
}
MR的输出:
db.prd_ord.find({'value.cust_ids.productId': 553}).pretty();
{
"_id" : NumberLong(553),
"value" : {
"cust_ids" : [
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000068),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000067),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000066),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000065),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000064),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000063),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000062),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000061),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000060),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000059),
"productId" : NumberLong(553)
}
],
"cust_ids_length" : 10
}
}
下一组从这里开始正常。如果某个特定组超过下一个阈值,也会发生同样的事情。
答案 2 :(得分:0)
对不起,我错过了你问题中的一些重要信息,例如(just 10 to 15 array of objects per key)
。如果sort:{productId:1}
基于asc / desc类型索引正常工作,则不应该发生奇怪的事情。但请检查是否有多次reduce请求相同的密钥,如果在您的代码设计上发生,则会失败。
我重新设计了以下程序,也许它可以帮到你。
mapOrd = function() {
var values = {cust_ids: [{
customerId: this.customerId,
orderNr: this.orderNr,
productId: this.productId
}]};
emit(this.productId, values);
};
reduceOrdPrd = function(k, values) {
var result = [];
values.forEach(function(value) {
value.cust_ids.forEach(function(e) {
result.push(e);
});
});
return {cust_ids: result};
};
db.prd_ord.drop();
db.order_10000.mapReduce(mapOrd, reduceOrdPrd, {"out": {"reduce": "prd_ord"}, "sort": {"productId": 1}});
为了帮助您发现让您感到困惑的地方,我会重写您的map function
,如下所示:
reduceOrdPrd = function(k, values) {
var result = {};
var reduceNum = 1; // if it's larger than 1 in the last output, dropping is possible for this key.
var reduceBatches = [values]; // record all batches of values used to call this reduce function. It would not lose data on this design.
values.forEach(function(value) {
if ("reduceNum" in value) {
reduceNum += value.reduceNum;
value.reduceBatches.forEach(function(e) {reduceBatches.push(e);});
}
var field;
if ("orderNr" in value) {
if (!("cust_ids" in result)) {
result.cust_ids = [];
}
result.cust_ids.push(value);
} else {
for (field in value) {
if (field != "reduceNum" && field != "reduceBatches") {
if (value.hasOwnProperty(field)) {
result[field] = value[field];
}
}
}
;
}
});
result.reduceNum = reduceNum;
result.reduceBatches = reduceBatches;
return result;
};
上面不会更改您的原始设计,但会在每个键的value
中添加两个字段。输出类似于:
db.prd_ord.find({'value.cust_ids.productId': 553}).pretty();
{
"_id" : NumberLong(553),
"value" : {
"reduceNum":<num>,
"reduceBatches": [[], [], ...]
"cust_ids" : [
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000068),
"productId" : NumberLong(553)
},
{
"customerId" : NumberLong(5699),
"orderNr" : NumberLong(6000067),
"productId" : NumberLong(553)
},
...
]
}
}