MongoDB MR,在中途默默地删除记录

时间:2014-08-22 13:50:03

标签: mongodb mapreduce

我在mongodb上运行一个MR,它在我尝试对10000条记录进行非规范化操作时默默地删除记录,它发生在集合中间的某个地方(~5K)。我尝试在记录附近删除(排序时),数据没有任何问题。

这是我的代码

mapOrd = function() {
    var values = {
        customerId: this.customerId,
        orderNr: this.orderNr,
        productId: this.productId
    };
    emit(this.productId, values);
};

mapPrd = function() {
    var values = {
    code: this.code
    };
    emit(this.id, values);
};

reduceOrdPrd =  function(k, values) {
    var result = {};
    values.forEach(function(value) {
    var field;
        if ("orderNr" in value) {
            if (!("cust_ids" in result)) {
                result.cust_ids = [];
            }
            result.cust_ids.push(value);
        } else {
    for (field in value) {
        if (value.hasOwnProperty(field) ) {
                result[field] = value[field];
        }
         };  
       }
      });
       return result;
};



db.prd_ord.drop();

db.order_10000.mapReduce(mapOrd, reduceOrdPrd, {"out": {"reduce": "prd_ord"}, "sort": {"productId": 1}});

我在配置非常低的机器上运行它。 512 MB RAM,1GB SWAP内存。

但无论如何,它不应该默默地(随机地)丢弃组的元素。

对象不超过BSON对象限制。(每个键只有10到15个对象数组)。

有什么想法吗?

3 个答案:

答案 0 :(得分:0)

考虑一下密钥的第一次减少。它将在循环中执行以下代码,

if ("orderNr" in value) {
    if (!("cust_ids" in result)) {
        result.cust_ids = [];
    }
    result.cust_ids.push(value);
}

然后获取并返回

resut = {
    cust_ids : [ {
        customerId : customerId1,
        orderNr : orderNr1,
        productId : productId1
    }, {
        customerId : customerId2,
        orderNr : orderNr2,
        productId : productId2
    }, {
        customerId : customerId_n,
        orderNr : orderNr_n,
        productId : productId_n
    } ]
}

对于第二个减少on the same key,假设有2个元素具有如上所述的结构值;肯定会用代码处理

for (field in value) {
    if (value.hasOwnProperty(field)) {
        result[field] = value[field];
    }
}

因为它们具有相同的键" cust_ids" 所以只剩下最后一个键。之前的所有内容都将被覆盖和删除。

答案 1 :(得分:0)

感谢您的回复。

我的代码按设计工作。唯一的问题是在我的代码中的5500条记录之后,在特定点丢弃相同发射密钥的记录。例如productid = 553(发出键)有12个元素。它们在输入集合order_10000中以5502 nd记录结束。从MR的输出中减少了第5501和5502条记录。

输入Order_10000:

db.order_10000.find({productId:553}).pretty();
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3410"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000058)
}
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3411"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000059)
}
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3412"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000060)
}
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3413"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000061)
}
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3414"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000062)
}
{
       "_id" : ObjectId("53f75e2eb4e41522bccf3415"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderLineNr" : NumberLong(1),
       "orderNr" : NumberLong(6000063)
}
{
       "_id" : ObjectId("53f75e2eb4e41522bccf3416"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000064)
}
{
       "_id" : ObjectId("53f75e2eb4e41522bccf3417"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000065)
}
{
       "_id" : ObjectId("53f75e2eb4e41522bccf3418"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000066)
}
{
       "_id" : ObjectId("53f75e97b4e41522bccf3419"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000067)
}
{
       "_id" : ObjectId("53f75e97b4e41522bccf341a"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000068)
}
{
       "_id" : ObjectId("53f76127b4e41522bccf341e"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000071)
}

MR的输出:

db.prd_ord.find({'value.cust_ids.productId': 553}).pretty();
{
       "_id" : NumberLong(553),
       "value" : {
               "cust_ids" : [
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000068),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000067),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000066),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000065),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000064),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000063),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000062),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000061),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000060),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000059),
                               "productId" : NumberLong(553)
                       }
               ],
               "cust_ids_length" : 10
       }    
}

下一组从这里开始正常。如果某个特定组超过下一个阈值,也会发生同样的事情。

答案 2 :(得分:0)

对不起,我错过了你问题中的一些重要信息,例如(just 10 to 15 array of objects per key)。如果sort:{productId:1}基于asc / desc类型索引正常工作,则不应该发生奇怪的事情。但请检查是否有多次reduce请求相同的密钥,如果在您的代码设计上发生,则会失败。

我重新设计了以下程序,也许它可以帮到你。

mapOrd = function() {
    var values = {cust_ids: [{
        customerId: this.customerId,
        orderNr: this.orderNr,
        productId: this.productId
    }]};
    emit(this.productId, values);
};

reduceOrdPrd = function(k, values) {
    var result = [];
    values.forEach(function(value) {
        value.cust_ids.forEach(function(e) {
            result.push(e);
        });
    });
    return {cust_ids: result};
};

db.prd_ord.drop();

db.order_10000.mapReduce(mapOrd, reduceOrdPrd, {"out": {"reduce": "prd_ord"}, "sort": {"productId": 1}});

为了帮助您发现让您感到困惑的地方,我会重写您的map function,如下所示:

reduceOrdPrd = function(k, values) {
    var result = {};
    var reduceNum = 1;  // if it's larger than 1 in the last output, dropping is possible for this key.
    var reduceBatches = [values]; // record all batches of values used to call this reduce function. It would not lose data on this design.
    values.forEach(function(value) {
        if ("reduceNum" in value) {
            reduceNum += value.reduceNum;
            value.reduceBatches.forEach(function(e) {reduceBatches.push(e);});
        }
        var field;
        if ("orderNr" in value) {
            if (!("cust_ids" in result)) {
                result.cust_ids = [];
            }
            result.cust_ids.push(value);
        } else {
            for (field in value) {
                if (field != "reduceNum" && field != "reduceBatches") {
                    if (value.hasOwnProperty(field)) {
                        result[field] = value[field];
                    }
                }
            }
            ;
        }
    });
    result.reduceNum = reduceNum;
    result.reduceBatches = reduceBatches;
    return result;
};

上面不会更改您的原始设计,但会在每个键的value中添加两个字段。输出类似于:

db.prd_ord.find({'value.cust_ids.productId': 553}).pretty();
{
       "_id" : NumberLong(553),
       "value" : {
                "reduceNum":<num>,
                "reduceBatches": [[], [], ...]
               "cust_ids" : [
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000068),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000067),
                               "productId" : NumberLong(553)
                       },
                       ...
                ]
       }
}