MongoDB MapReduce causes "Error: error doing query"

Posted: 2014-06-19 17:27:41

Tags: mongodb mapreduce

I have a collection of documents with a schema like this (some members redacted):

{
    "_id" : ObjectId("539f41a95d1887b57ab78bea"),
    "answers" : {
        "ratings" : {
            "positivity" : [ 
                2, 
                3, 
                5
            ],
            "activity" : [ 
                4, 
                4, 
                3
            ]
        }
    },
    "media" : [ 
        ObjectId("537ea185df872bb71e4df270"), 
        ObjectId("537ea185df872bb71e4df275"), 
        ObjectId("537ea185df872bb71e4df272")
    ]
}

In this schema, the first, second, and third positivity ratings correspond to the first, second, and third entries in the media array, respectively. The same goes for the activity ratings. I need to compute statistics on the activity and positivity ratings, grouped by the associated media object, across all documents in the collection. Right now I'm doing this for only the first entry, with the following MapReduce setup:

var mapFunction = function() {
    var activity = {
        sum: this.answers.ratings.activity[0],
        min: this.answers.ratings.activity[0],
        max: this.answers.ratings.activity[0],
        count: 1,
        diff: 0
    };

    var positivity = {
        sum: this.answers.ratings.positivity[0],
        min: this.answers.ratings.positivity[0],
        max: this.answers.ratings.positivity[0],
        count: 1,
        diff: 0
    };

    emit(this.media[0].str, {'activity': activity, 'positivity': positivity});
}

var reduceFunction = function(key, values) {

    var activityA = values[0].activity; // will reduce into here
    for (var i = 1; i < values.length; i++) {

        var activityB = values[i].activity; // will merge 'b' into 'a'

        // temp helpers
        var delta = activityA.sum/activityA.count - activityB.sum/activityB.count; // a.mean - b.mean
        var weight = (activityA.count * activityB.count)/(activityA.count + activityB.count);

        // do the reducing
        activityA.diff += activityB.diff + delta*delta*weight;
        activityA.sum += activityB.sum;
        activityA.count += activityB.count;
        activityA.min = Math.min(activityA.min, activityB.min);
        activityA.max = Math.max(activityA.max, activityB.max);
    }

    var positivityA = values[0].positivity; // will reduce into here
    for (var i = 1; i < values.length; i++) {

        var positivityB = values[i].positivity; // will merge 'b' into 'a'

        // temp helpers
        var delta = positivityA.sum/positivityA.count - positivityB.sum/positivityB.count; // a.mean - b.mean
        var weight = (positivityA.count * positivityB.count)/(positivityA.count + positivityB.count);

        // do the reducing
        positivityA.diff += positivityB.diff + delta*delta*weight;
        positivityA.sum += positivityB.sum;
        positivityA.count += positivityB.count;
        positivityA.min = Math.min(positivityA.min, positivityB.min);
        positivityA.max = Math.max(positivityA.max, positivityB.max);
    }

    return {'activity': activityA, 'positivity': positivityA};
}
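
(For reference, the delta*delta*weight line is the standard pairwise update for merging sums of squared deviations: given two partial results A and B, diff_AB = diff_A + diff_B + delta^2 * (count_A * count_B) / (count_A + count_B), where delta = mean_A - mean_B. This is what lets the finalize step below compute the variance simply as diff / count.)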

var finalizeFunction = function(key, value) {
    value.activity.mean = value.activity.sum / value.activity.count;
    value.activity.population_variance = value.activity.diff / value.activity.count;
    value.activity.population_std = Math.sqrt(value.activity.population_variance);
    value.activity.sample_variance = value.activity.diff / (value.activity.count - 1);
    value.activity.sample_std = Math.sqrt(value.activity.sample_variance);

    value.positivity.mean = value.positivity.sum / value.positivity.count;
    value.positivity.population_variance = value.positivity.diff / value.positivity.count;
    value.positivity.population_std = Math.sqrt(value.positivity.population_variance);
    value.positivity.sample_variance = value.positivity.diff / (value.positivity.count - 1);
    value.positivity.sample_std = Math.sqrt(value.positivity.sample_variance);
    return value;
}

var limitingQuery = {'answers.ratings.activity': {$exists: true}, 'answers.ratings.positivity': {$exists: true}};
db.trials.mapReduce(mapFunction, reduceFunction, {query: limitingQuery, finalize: finalizeFunction, out: {replace: 'base_ratings', db: 'tmp'}});
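
Once the job finishes, a quick way to sanity-check the output (using the same output database and collection named above) is to read a document back; each mapReduce result has the shape { _id: <emitted key>, value: <reduced/finalized value> }:

// Inspect one finalized result for a single media id
db.getSiblingDB('tmp').base_ratings.findOne()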

With a small number of documents, this all works as I expect. When I run it against the entire collection, though, strange things happen. First, when I run db.currentOp(), I get the following output:

{
        "inprog" : [
                {
                        "opid" : 2337,
                        "active" : true,
                        "secs_running" : 2787,
                        "microsecs_running" : NumberLong("2787597940"),
                        "op" : "query",
                        "ns" : "eim.trials",
                        "query" : {
                                "$msg" : "query not recording (too large)"
                        },
                        "planSummary" : "COLLSCAN",
                        "client" : "109.201.154.152:59939",
                        "desc" : "conn17",
                        "threadId" : "0x7ef89b022700",
                        "connectionId" : 17,
                        "locks" : {
                                "^" : "r",
                                "^eim" : "R"
                        },
                        "waitingForLock" : false,
                        "msg" : "m/r: (1/3) emit phase M/R: (1/3) Emit Progress: 8300/1 830000%",
                        "progress" : {
                                "done" : 8300,
                                "total" : 1
                        },
                        "numYields" : 1133,
                        "lockStats" : {
                                "timeLockedMicros" : {
                                        "r" : NumberLong("5075753298"),
                                        "w" : NumberLong(2274)
                                },
                                "timeAcquiringMicros" : {
                                        "r" : NumberLong(243155328),
                                        "w" : NumberLong(131)
                                }
                        }
                },
                {
                        "opid" : 2480,
                        "active" : true,
                        "secs_running" : 2111,
                        "microsecs_running" : NumberLong(2111502538),
                        "op" : "query",
                        "ns" : "eim.trials",
                        "query" : {
                                "$msg" : "query not recording (too large)"
                        },
                        "planSummary" : "COLLSCAN",
                        "client" : "109.201.154.192:61609",
                        "desc" : "conn23",
                        "threadId" : "0x7ef89ac1e700",
                        "connectionId" : 23,
                        "locks" : {
                                "^" : "r",
                                "^eim" : "R"
                        },
                        "waitingForLock" : false,
                        "msg" : "m/r: (1/3) emit phase M/R: (1/3) Emit Progress: 7952/1 795200%",
                        "progress" : {
                                "done" : 7952,
                                "total" : 1
                        },
                        "numYields" : 819,
                        "lockStats" : {
                                "timeLockedMicros" : {
                                        "r" : NumberLong("3399905661"),
                                        "w" : NumberLong(73184)
                                },
                                "timeAcquiringMicros" : {
                                        "r" : NumberLong(406543723),
                                        "w" : NumberLong(145)
                                }
                        }
                }
        ]
}

What's the deal with an Emit Progress greater than 100%? I know that no other documents were being added to the collection while this was running. And inprog.progress.done is 8300 while inprog.progress.total is 1. Huh?

Worse, the operation eventually fails with an error:

Error: error doing query: failed src/mongo/shell/query.js:78
Even after the error is reported, db.currentOp() continues to return results like those above. The logs show no errors at all, only lines indicating that this command is still running:

2014-06-19T13:24:15.378-0400 [conn23]           M/R: (1/3) Emit Progress: 8400

Any ideas what might be going on here? I'm running MongoDB 2.6.2 on Ubuntu 13.10.

2 Answers:

Answer 0 (score: 0):

Not an answer (apparently I don't have enough reputation to comment), but I ran into a similar error and opened an issue with MongoDB: https://jira.mongodb.org/browse/SERVER-15334. If you found this page while googling, as I did, follow that link for (hopefully!) more details to come. MongoDB is usually very good about getting back quickly on this kind of thing.

Answer 1 (score: -1):

Sometimes it helps to add some validation checks before emitting. I won't say it always helps.

But sometimes, when I've run into this error, the problem was with the quality of the values I was emitting.

Try running it on small chunks and check whether it behaves correctly, for example something like the sketch below.
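
Here is a minimal sketch of what such a guard might look like for the map function in the question (purely illustrative; it reuses the field names from the question, and the limit option is just one way to run a small chunk):

var guardedMapFunction = function() {
    // Skip documents whose rating or media arrays are missing or empty,
    // so nothing undefined is ever emitted into the reduce phase.
    if (!this.answers || !this.answers.ratings) return;
    var act = this.answers.ratings.activity;
    var pos = this.answers.ratings.positivity;
    if (!act || !pos || act.length === 0 || pos.length === 0) return;
    if (!this.media || this.media.length === 0) return;

    emit(this.media[0].str, {
        activity:   { sum: act[0], min: act[0], max: act[0], count: 1, diff: 0 },
        positivity: { sum: pos[0], min: pos[0], max: pos[0], count: 1, diff: 0 }
    });
};

// Run against a small chunk first, returning results inline for a quick check
db.trials.mapReduce(guardedMapFunction, reduceFunction, {
    query: limitingQuery,
    limit: 100,
    finalize: finalizeFunction,
    out: { inline: 1 }
});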