我有一个很大的(无法全部装入内存的).json文件,其中包含以下内容:
[{
"doc_number": "xxx",
"other": "data"
}, {
"doc_number": "yyy",
"other": "data"
}, {
"doc_number": "zzz",
"other": "data"
}]
我想在占用尽可能少内存的前提下,尽可能快地读取它。在其他语言中,我通常会创建一个惰性(lazy)的文件序列,只在需要时才读取。我想知道Erlang中是否有惯用的方法来实现这一目标。
答案 0 :(得分:3)
jsx可以用作增量解析器,但是对于您的数据格式,您必须编写自己的回调模块:
-module(jsx_increment).
-export([parse_file/1]).
-export([init/1, handle_event/2]).
%% @doc Stream-parse the JSON file `FN', whose top-level value must be an
%% array, emitting each array element as soon as it has been decoded.
%%
%% The file is opened in raw binary mode and handed to `read/2' together
%% with a jsx decoder that uses this module as its callback handler.
%% Crashes (badmatch) if the file cannot be opened; any crash raised while
%% reading or decoding is propagated to the caller.
%%
%% Returns `ok'.
parse_file(FN) ->
    {ok, File} = file:open(FN, [read, raw, binary]),
    try
        read(File, jsx:decoder(?MODULE, [], [stream, return_tail]))
    after
        %% Always release the descriptor, even when read/2 crashes on
        %% premature eof or malformed input.
        file:close(File)
    end,
    ok.
%% @doc Feed `File' to the incremental decoder `JSX' one chunk at a time.
%%
%% `JSX' is a fun taking a binary chunk; it answers `{incomplete, Next}'
%% while more input is needed, or `{with_tail, State, Tail}' once the
%% top-level JSON value is complete. Any non-empty `Tail' left over from
%% the last chunk is reported on standard output.
%%
%% Crashes with `{badmatch, eof}' if the file ends before the decoder has
%% seen a complete value.
%%
%% Returns `ok'.
read(File, JSX) ->
    %% Read in 64 KiB chunks rather than a handful of bytes, so that a
    %% large file does not cost one file:read/2 call and one decoder call
    %% per few bytes of input.
    {ok, Data} = file:read(File, 65536), %% eof should raise error
    case JSX(Data) of
        {incomplete, F} ->
            read(File, F);
        {with_tail, _, <<>>} ->
            ok;
        {with_tail, _, Tail} ->
            io:format("Surplus content: ~s~n", [Tail])
    end.
%% @doc Handler callback: produce the initial handler state.
%% The argument is ignored; the state is always the atom `start', meaning
%% that no event has been seen yet.
init(_Args) ->
    start.
%% @doc Handler callback: fold one decoder event into the handler state.
%%
%% State machine:
%%   `start' - nothing seen yet; the first event must be `start_array',
%%             anything else raises `error(expect_array)'.
%%   list    - a stack (most recent item first) of the pieces of the
%%             top-level array element currently being assembled. The
%%             atoms `start_object' / `start_array' mark where a nested
%%             container begins.
%%   `stop'  - the top-level array has been closed.
%%
%% Whenever the stack collapses to a single finished value, `check_out/1'
%% emits it and resets the stack to `[]'.
handle_event(start_array, start) ->
    [];
handle_event(_, start) ->
    error(expect_array);
%% The end-of-document event arrives after the top-level array has been
%% closed. Keep the state as `stop' instead of letting the catch-all
%% clause build the improper list [end_json | stop].
handle_event(end_json, stop) ->
    stop;
handle_event(start_object, L) ->
    [start_object|L];
handle_event(start_array, L) ->
    [start_array|L];
handle_event(end_object, L) ->
    check_out(collect_object(L));
%% An empty stack here means this `end_array' closes the top-level array.
handle_event(end_array, []) ->
    stop;
handle_event(end_array, L) ->
    check_out(collect_array(L));
%% Keys and scalar values: strip the type tag and push onto the stack.
handle_event(E, L) ->
    check_out([event(E)|L]).
%% @doc Emit a finished top-level element, if there is one.
%%
%% A stack holding exactly one item is treated as a completed element: it
%% is printed and the stack is reset to `[]'. Any other stack is returned
%% unchanged.
check_out([Item]) ->
    io:format("Collected object: ~p~n", [Item]),
    [];
check_out(Stack) ->
    Stack.
%% @doc Strip the tag from a tagged event.
%% A two-element tuple yields its second element; every other term is
%% returned as it is.
event({_Tag, Value}) ->
    Value;
event(Other) ->
    Other.
%% @doc Collapse the innermost open object on the stack into a map.
%%
%% The stack is expected to hold alternating `Value, Key' entries (most
%% recent first) down to the nearest `start_object' marker. Those entries
%% and the marker are replaced by a single map; everything below the
%% marker is left untouched.
collect_object(Stack) ->
    collect_object(Stack, #{}).

%% Pop value/key pairs into the accumulator map until the marker is hit.
collect_object([start_object | Below], Acc) ->
    [Acc | Below];
collect_object([Value, Key | Rest], Acc) ->
    collect_object(Rest, Acc#{Key => Value}).
%% @doc Collapse the innermost open array on the stack into a list.
%%
%% Items above the nearest `start_array' marker are stored most recent
%% first; they are popped into a list in their original order, and that
%% list replaces the items and the marker on the stack.
collect_array(Stack) ->
    collect_array(Stack, []).

%% Moving each popped item to the front of the accumulator undoes the
%% reversed order of the stack.
collect_array([start_array | Below], Items) ->
    [Items | Below];
collect_array([Item | Rest], Items) ->
    collect_array(Rest, [Item | Items]).
你的例子:
1> io:put_chars(element(2, file:read_file("data.json"))).
[{
"doc_number": "xxx",
"other": "data"
}, {
"doc_number": "yyy",
"other": "data"
}, {
"doc_number": "zzz",
"other": "data"
}]
ok
2> jsx_increment:parse_file("data.json").
Collected object: #{<<"doc_number">> => <<"xxx">>,<<"other">> => <<"data">>}
Collected object: #{<<"doc_number">> => <<"yyy">>,<<"other">> => <<"data">>}
Collected object: #{<<"doc_number">> => <<"zzz">>,<<"other">> => <<"data">>}
ok
这只是概念验证(proof of concept)代码,你需要根据自己的用例进行调整,并补充错误处理等。(这里使用的映射(map)语法仅适用于R18;对于R17,请使用maps:put(K, V, M);对于R17之前的版本,请使用proplist。)