我一直在学习Riak,并在MapReduce上遇到了问题。我的MapReduce函数在只有大约15条记录时工作正常,但记录再多一些,它就会抛出带堆栈跟踪的错误。我是Riak和Erlang的新手,所以不确定问题出在我的代码还是Riak本身。如能就如何调试或解决此问题给出任何建议,不胜感激!
%% Map-phase module: turns each stored JSON object into a proplist so that a
%% later reduce phase can read individual fields.
-module(identity_map).
-export([identity/3]).

%% identity(Object, KeyData, Arg) -> [Proplist] | []
%%
%% Object is the riak object handed to the map phase, or {error, notfound}
%% when the requested key does not exist. The last two arguments are unused.
%%
%% Returns a one-element list holding the decoded JSON object's fields as a
%% list of {BinaryKey, Value} pairs, or [] for a missing object so that it
%% contributes nothing to later phases.
identity({error, notfound}, _KeyData, _Arg) ->
    [];
identity(Object, _KeyData, _Arg) ->
    %% get_value/1 asserts that the object has exactly one value; decoding the
    %% whole get_values/1 list would concatenate siblings into invalid JSON.
    JsonData = riak_object:get_value(Object),
    {struct, Fields} = mochijson2:decode(JsonData),
    [Fields].
%% Reduce-phase module: computes the average of the <<"high_d">> field.
-module(stocks_summary).
-export([average_high/2]).
%% average_high(Values, Arg) -> [Average]
%%
%% Sums the <<"high_d">> entry of every element of Values and returns a
%% one-element list holding that sum divided by length(Values). The second
%% argument is ignored.
%%
%% NOTE(review): every element of Values has to be a proplist, yet the result
%% is a bare number. If this function is ever given its own output as part of
%% Values, proplists:get_value/2 is called on a number and fails with
%% function_clause.
%% NOTE(review): an empty Values list makes the final division fail, because
%% length(Values) is 0.
average_high(Values, _) ->
Total = lists:foldl(
fun(Record, Accum) ->
% Assumes the key is present; a missing key yields 'undefined' and the
% addition below fails.
High = proplists:get_value(<<"high_d">>, Record),
Accum + High
end,
0,
Values),
[Total / length(Values)].
# Submit the MapReduce job to the HTTP /mapred endpoint: the map phase runs
# identity_map:identity and the reduce phase runs stocks_summary:average_high.
curl --request POST \
--header 'Content-Type: application/json' \
--data '{"inputs": ["stocks","goog"], "query": [{"map":{"language":"erlang","module":"identity_map","function":"identity"}}, {"reduce":{"language":"erlang","module":"stocks_summary","function":"average_high"}} ]}' \
http://192.168.0.126:8098/mapred
Date,Open,High,Low,Close,Volume,Adj Close
2010-05-05,500.98,515.72,500.47,509.76,4566900,509.76
2010-05-04,526.52,526.74,504.21,506.37,6076300,506.37
2010-05-03,526.50,532.92,525.08,530.60,1857800,530.60
2010-04-30,531.13,537.68,525.44,525.70,2435400,525.70
2010-04-29,533.37,536.50,526.67,532.00,3058900,532.00
2010-04-28,532.10,534.83,521.03,529.19,3406100,529.19
2010-04-27,528.95,538.33,527.23,529.06,3844700,529.06
2010-04-26,544.97,544.99,529.21,531.64,4368800,531.64
2010-04-23,547.25,549.32,542.27,544.99,2089400,544.99
2010-04-22,552.00,552.50,543.35,547.06,3280700,547.06
2010-04-21,556.46,560.25,552.16,554.30,2391500,554.30
2010-04-20,554.17,559.66,551.06,555.04,2977400,555.04
2010-04-19,548.75,553.99,545.00,550.10,3894000,550.10
2010-04-16,563.00,568.81,549.63,550.15,12235500,550.15
2010-04-15,592.17,597.84,588.29,595.30,6716700,595.30
2010-04-14,590.06,592.34,584.01,589.00,3402700,589.00
2010-04-13,572.53,588.88,571.13,586.77,3845200,586.77
2010-04-12,567.35,574.00,566.22,572.73,2352400,572.73
2010-04-09,567.49,568.77,564.00,566.22,2056600,566.22
2010-04-08,563.32,569.85,560.05,567.49,1947500,567.49
2010-04-07,567.30,568.75,561.86,563.54,2581000,563.54
{
"date_s": "2010-04-07",
"open_d": 567.3,
"high_d": 568.75,
"low_d": 561.86,
"close_d": 563.54,
"volume_i": 2581000,
"adjClose_d": 563.54
}
{
"phase": 1,
"error": "{function_clause,[{proplists,get_value,[<<\"high_d\">>,555.621,undefined],[{file,\"proplists.erl\"},{line,225}]},{stocks_summary,'-average_high/2-fun-0-',2,[{file,\"stocks_summary.erl\"},{line,9}]},{lists,foldl,3,[{file,\"lists.erl\"},{line,1248}]},{stocks_summary,average_high,2,[{file,\"stocks_summary.erl\"},{line,7}]},{riak_kv_w_reduce,reduce,3,[{file,\"src/riak_kv_w_reduce.erl\"},{line,207}]},{riak_kv_w_reduce,done,1,[{file,\"src/riak_kv_w_reduce.erl\"},{line,170}]},{riak_pipe_vnode_worker,wait_for_input,...},...]}",
"input": null,
"type": null,
"stack": null
}
从堆栈跟踪来看,reduce函数似乎被应用在了一个数值上,而不是一个元组列表(proplist)上。但奇怪的是,当我只往存储桶中放入10到15条记录时,它运行正常。
答案 0 :(得分:2)
问题出在reduce阶段。map阶段分布在整个集群中,由许多vnode执行,这些vnode会把map阶段的结果转发给运行reduce的节点。由于这些结果不会同时到达,reduce阶段的函数可能会被多次运行,其后续运行的输入是上一次reduce的结果与新到达的map阶段结果拼接而成的列表。
这意味着在第二次运行时,您的reduce函数收到的列表中,第一个元素是上一次算出的平均值(一个普通数字),其余元素才是您期望的JSON对象/proplist。
要解决此问题,请让reduce函数返回一个proplist,其中包含当前平均值和到目前为止已处理的值的数量。下面是一种可能的实现,但请注意,这个示例会使MapReduce的最终结果以对象/proplist的形式返回,而不是一个数字。
%% average_high(Values, Arg) -> [[{<<"average">>, Avg}, {<<"count">>, N}]] | []
%%
%% Reduce-phase function that averages the <<"high_d">> field and can safely
%% be fed its own output. Values may mix two kinds of proplists:
%%   * raw records, which carry a <<"high_d">> number, and
%%   * earlier results of this function, which carry <<"average">> and
%%     <<"count">>.
%% The second argument is ignored.
%%
%% Returns a one-element list holding the combined average and the number of
%% raw records it covers, or [] when Values contributes no records.
average_high(Values, _Arg) ->
    case lists:foldl(fun accumulate_high/2, {0, 0}, Values) of
        {0, _Total} ->
            %% Nothing to average: dividing by a zero count would raise
            %% badarith, so return an empty result list instead.
            [];
        {Count, Total} ->
            [[{<<"average">>, Total / Count}, {<<"count">>, Count}]]
    end.

%% Folds one element into the running {Count, Total} pair.
accumulate_high(Record, {Count, Total}) ->
    case proplists:get_value(<<"average">>, Record) of
        undefined ->
            %% Raw record: counts once and contributes its own high value.
            High = proplists:get_value(<<"high_d">>, Record),
            {Count + 1, Total + High};
        Average ->
            %% Earlier partial result: weight its average by the number of
            %% records it represents to recover that partial sum.
            PartialCount = proplists:get_value(<<"count">>, Record, 1),
            {Count + PartialCount, Total + Average * PartialCount}
    end.
reduce函数应该生成一个值列表,同时该函数还必须满足交换律、结合律和幂等性。也就是说,如果输入列表[a,b,c,d]对给定的F有效,那么以下所有表达式都必须产生相同的结果:
F([a,b,c,d])
F([a,d] ++ F([c,b]))
F([F([a]),F([c]),F([b]),F([d])])