对于我的应用程序,我必须使用Erlang解析CSV文件。以下是使用Erlang解析CSV的代码: -
%% Reads the whole file Fn into memory and parses it as CSV.
%% Crashes with a badmatch if the file cannot be read.
%% Returns the result of parse/1 applied to the file contents as a byte list.
parse_file(Fn) ->
    {ok, Contents} = file:read_file(Fn),
    Chars = binary_to_list(Contents),
    parse(Chars).
%% Parses CSV text given as a character list and returns its rows in input order.
%% parse/2 accumulates rows newest-first, so the result is reversed here.
parse(Data) ->
    RowsNewestFirst = parse(Data, []),
    lists:reverse(RowsNewestFirst).
%% Consumes Data one record at a time via parse_line/1, prepending each parsed
%% row to Acc. Once the input is exhausted, returns the accumulated rows
%% (newest first).
parse(Data, Acc) ->
    case Data of
        [] ->
            Acc;
        _ ->
            {Row, Remaining} = parse_line(Data),
            parse(Remaining, [Row | Acc])
    end.
%% Parses one CSV record from the front of Data (a character list).
%%
%% Returns {Fields, Tail}: Fields is the list of field strings in input order,
%% Tail is the input following the record's line terminator (CRLF, LF or CR),
%% or [] at end of input.
%%
%% Every comma separates exactly two fields, so leading, consecutive and
%% trailing commas all produce empty-string fields. An input that starts
%% directly with a line terminator yields an empty field list.
parse_line(Data) ->
    {Line, Tail} = parse_line(Data, []),
    {lists:reverse(Line), Tail}.

%% Worker for parse_line/1. Acc holds the fields read so far, newest first,
%% and is returned still reversed.
parse_line([$\r, $\n | Data], Acc) ->
    {Acc, Data};
parse_line([$\n | Data], Acc) ->
    {Acc, Data};
parse_line([$\r | Data], Acc) ->
    {Acc, Data};
parse_line([], Acc) ->
    {Acc, []};
%% A comma before any field has been read means the first field is empty.
%% The comma is left in place so the next clause treats it as the separator.
parse_line([$, | _] = Data, []) ->
    parse_line(Data, [""]);
%% A separator is always followed by a field, possibly an empty one.
parse_line([$, | Data], Acc) ->
    {Fld, Tail} = parse_field(Data),
    parse_line(Tail, [Fld | Acc]);
parse_line(Data, Acc) ->
    {Fld, Tail} = parse_field(Data),
    parse_line(Tail, [Fld | Acc]).

%% Parses a single field from the front of Data and returns {Field, Tail}.
%% A field starting with a double quote is read as a quoted field; anything
%% else is read up to (not including) the next comma or line terminator.
parse_field([$" | Data]) ->
    {Fld, Tail} = parse_fieldq(Data, ""),
    {lists:reverse(Fld), Tail};
parse_field(Data) ->
    {Fld, Tail} = parse_field(Data, ""),
    {lists:reverse(Fld), Tail}.

%% Unquoted field: collects characters (reversed, in Acc) until a comma,
%% CR, LF or end of input. The delimiter stays at the head of the tail.
parse_field([$, | _] = Rest, Acc) ->
    {Acc, Rest};
parse_field([$\r | _] = Rest, Acc) ->
    {Acc, Rest};
parse_field([$\n | _] = Rest, Acc) ->
    {Acc, Rest};
parse_field([], Acc) ->
    {Acc, []};
parse_field([Ch | Tail], Acc) ->
    parse_field(Tail, [Ch | Acc]).

%% Quoted field body (opening quote already consumed): "" is an escaped
%% double quote, a single " ends the field. Commas and line breaks inside
%% the quotes are kept. There is deliberately no clause for end of input:
%% an unterminated quoted field fails with function_clause.
parse_fieldq([$", $" | Tail], Acc) ->
    parse_fieldq(Tail, [$" | Acc]);
parse_fieldq([$" | Tail], Acc) ->
    {Acc, Tail};
parse_fieldq([Ch | Tail], Acc) ->
    parse_fieldq(Tail, [Ch | Acc]).
此代码基本可以工作,但有两个问题: - 1 - 代码依靠双引号(")和逗号(,)来分隔各个值。但在下面的示例中,如果名字(First Name)字段本身包含带双引号的字符串,解析器就会多创建一个字段。
"Type","First Name","Last Name","Email"
"Contact","Ashwani Garg ------"All Pain Will End."","","itisashwani4u@gmail.com"
result:-
[["contact"],["Ashwani Garg ------"],["All Pain Will End."],[],["itisashwani4u@gmail.com"]]
expected result:-
[["contact"],["Ashwani Garg ------All Pain Will End."],[],["itisashwani4u@gmail.com"]]
2 - 对于下面这种CSV,解析结果会丢掉一部分值(连续的空字段被截断): - 名字,姓氏,中间名,姓名,昵称,电子邮件地址,家庭街,家乡城市,家庭邮政编码,家庭状态,家乡/地区,家庭电话,家庭传真,移动电话,个人网页,商务街,商业城,商业邮政编码,商业状态,商业国家/地区,商业网页,商务电话,商业传真,寻呼机,公司,职位,部门,办公地点,备注
Affection,,,Affection,,,,,,,,+919845141544,,+919845141544,,,,,,,,,,,,,,,
result:-
[["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[]]
expected result:-
[["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
请帮助我...参考请使用以下链接: - http://ppolv.wordpress.com/2008/02/25/parsing-csv-in-erlang/
答案 0 :(得分:6)
%% Opens File and parses it line by line as CSV.
%% Crashes with a badmatch if the file cannot be opened.
%% The open handle is handed to parse/3 together with the first read result.
parse(File) ->
    %% file:read_line/1 on a raw file is inefficient unless read-ahead
    %% buffering is enabled, so request it explicitly.
    {ok, F} = file:open(File, [read, raw, read_ahead]),
    parse(F, file:read_line(F), []).
%% Line-reading loop behind parse/1.
%%   F     - the open file handle
%%   2nd   - the most recent result of file:read_line(F)
%%   Done  - rows parsed so far, newest first
%% Returns the rows in file order. The file is closed at end of file and
%% before raising on a read error.
parse(F, eof, Done) ->
    file:close(F),
    lists:reverse(Done);
parse(F, {ok, Line}, Done) ->
    %% read_line/1 wraps the text in {ok, _} and keeps the line terminator;
    %% unwrap it and drop the terminator so it does not leak into the last field.
    Row = parse_line(strip_eol(Line)),
    parse(F, file:read_line(F), [Row | Done]);
parse(F, {error, Reason}, _Done) ->
    file:close(F),
    erlang:error({read_line_failed, Reason}).

%% Removes one trailing line terminator (CRLF, LF or CR) from Line, if present.
strip_eol(Line) ->
    case lists:reverse(Line) of
        [$\n, $\r | Rest] -> lists:reverse(Rest);
        [$\n | Rest] -> lists:reverse(Rest);
        [$\r | Rest] -> lists:reverse(Rest);
        _ -> Line
    end.
%% Splits one CSV line (without its line terminator) into a list of field
%% strings, in input order. A field may be wrapped in double quotes; inside
%% quotes a comma is literal and "" stands for a single double quote.
%% An empty line yields [].
parse_line("," ++ _ = Line) ->
    %% A leading comma means the first field is empty. The comma itself is
    %% kept so parse_line/2 still sees the separator before the second field.
    parse_line(Line, [[]]);
parse_line(Line) ->
    parse_line(Line, []).

%% Fields holds the fields parsed so far, newest first. At end of line the
%% list is reversed and returned; otherwise an optional separator is skipped
%% and the next field is parsed.
parse_line([], Fields) ->
    lists:reverse(Fields);
parse_line("," ++ Line, Fields) ->
    parse_field(Line, Fields);
parse_line(Line, Fields) ->
    parse_field(Line, Fields).

%% Chooses between quoted and unquoted parsing by looking at the first character.
parse_field("\"" ++ Line, Fields) ->
    parse_field_q(Line, [], Fields);
parse_field(Line, Fields) ->
    parse_field(Line, [], Fields).

%% Unquoted field: Buf collects characters in reverse until a comma or the
%% end of the line. The comma is left for parse_line/2 to consume.
parse_field("," ++ _ = Line, Buf, Fields) ->
    parse_line(Line, [lists:reverse(Buf) | Fields]);
parse_field([C | Line], Buf, Fields) ->
    parse_field(Line, [C | Buf], Fields);
parse_field([], Buf, Fields) ->
    parse_line([], [lists:reverse(Buf) | Fields]).

%% Convenience entry with an empty buffer. Not called by the functions in
%% this block; kept so any existing caller continues to work.
parse_field_q(Line, Fields) ->
    parse_field_q(Line, [], Fields).

%% Quoted field body (opening quote already consumed). "" is an escaped
%% quote; a single " closes the field. There is no clause for an empty
%% line, so an unterminated quoted field fails with function_clause.
parse_field_q("\"\"" ++ Line, Buf, Fields) ->
    parse_field_q(Line, [$" | Buf], Fields);
parse_field_q("\"" ++ Line, Buf, Fields) ->
    parse_line(Line, [lists:reverse(Buf) | Fields]);
parse_field_q([C | Line], Buf, Fields) ->
    parse_field_q(Line, [C | Buf], Fields).
不使用 file:read_line 的版本:
%% Reads File completely and parses its contents as CSV via parse/2.
%% Crashes with a badmatch if the file cannot be read.
parse_file(File) ->
    {ok, Contents} = file:read_file(File),
    Chars = binary_to_list(Contents),
    parse(Chars, []).
%% Splits Data into lines and parses each one with parse_line/1.
%% Done accumulates parsed rows newest-first; the rows are returned in
%% input order once Data is exhausted.
parse([], Done) ->
    lists:reverse(Done);
parse(Data, Done) ->
    %% "\r\n" must be the first alternative: regex alternation picks the
    %% first branch that matches, so with "\r" listed first a CRLF would be
    %% split at the CR only and the leftover LF would yield an empty row.
    {Line, Rest} =
        case re:split(Data, "\r\n|\r|\n", [{return, list}, {parts, 2}]) of
            [L, R] -> {L, R};
            [L] -> {L, []}
        end,
    parse(Rest, [parse_line(Line) | Done]).
答案 1 :(得分:2)
顺便说一句:
您是如何创建CSV输入的?它似乎不是有效的CSV(尽管CSV并没有特别严格的规范)。
通常在CSV字段中使用双引号,它们需要作为一对双引号进行转义,因此您的示例将是:
"Type","First Name","Last Name","Email"
"Contact","Ashwani Garg ------""All Pain Will End.""","","itisashwani4u@gmail.com"
这样的文件可以导入 OpenOffice 电子表格,而原始示例则不行。
答案 2 :(得分:1)
在Trapexit中也讨论了从文件中读取行。根据您的需求调整应该是微不足道的:
答案 3 :(得分:1)
我的实施:
-module(csv).
-export([
parse/1
]).
%% Reads File and parses its contents as CSV.
%% Returns {ok, Rows} on success. Any exception raised while reading or
%% parsing is caught and returned as {Class, Reason}.
parse(File) ->
    try
        {ok, Contents} = file:read_file(File),
        Chars = binary_to_list(Contents),
        Rows = parse(Chars, [], [], []),
        {ok, Rows}
    catch
        Kind:Reason ->
            {Kind, Reason}
    end.
%% Character-by-character CSV parser.
%%   1st    - remaining input (character list)
%%   FBuff  - characters of the current unquoted field, newest first
%%   RBuff  - fields of the current row, newest first
%%   Result - completed rows, newest first
%% Returns the list of rows in input order, each row a list of strings.
%% Rows end at LF or CRLF; a final row without a line break is kept.
%% One space directly after a separating comma is skipped.
%% Doubled quotes inside a quoted field are NOT unescaped.
parse([], [], [], Result) ->
    %% Input ended right after a line break (or was empty): nothing pending.
    lists:reverse(Result);
parse([], FBuff, RBuff, Result) ->
    %% No trailing line break: flush the row that is still being built.
    lists:reverse([end_row(FBuff, RBuff) | Result]);
parse([$" | Rest], _FBuff, RBuff, Result) ->
    %% Anything buffered before the opening quote is discarded.
    {F, Rest1} = parse_q(Rest, []),
    after_q(Rest1, [F | RBuff], Result);
parse([$,, $\s | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$, | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$\r, $\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [end_row(FBuff, RBuff) | Result]);
parse([$\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [end_row(FBuff, RBuff) | Result]);
parse([A | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [A | FBuff], RBuff, Result).

%% Completes a row: the buffered unquoted field becomes the row's last field.
end_row(FBuff, RBuff) ->
    lists:reverse([lists:reverse(FBuff) | RBuff]).

%% Continues after a quoted field has been pushed onto RBuff. The field is
%% already complete, so a line break here must not add another (empty) field.
after_q([$,, $\s | Rest], RBuff, Result) ->
    parse(Rest, [], RBuff, Result);
after_q([$, | Rest], RBuff, Result) ->
    parse(Rest, [], RBuff, Result);
after_q([$\r, $\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
after_q([$\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
after_q([], RBuff, Result) ->
    lists:reverse([lists:reverse(RBuff) | Result]).

%% Reads the body of a quoted field (opening quote already consumed).
%% The field ends at a double quote that is followed by a comma, LF, CRLF or
%% the end of input; any other double quote is kept as a literal character.
%% Returns {Field, Rest} where Rest starts right after the closing quote.
%% There is no clause for running out of input, so an unterminated quoted
%% field fails with function_clause.
parse_q([$"], Result) ->
    {lists:reverse(Result), []};
parse_q([$" | [$, | _] = Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([$" | [$\n | _] = Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([$" | [$\r, $\n | _] = Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([A | Rest], Result) ->
    parse_q(Rest, [A | Result]).
但是,此解决方案无法处理嵌套引号...
例如:
1,"你好,""世界""","她说:""这是'解决方案',不是吗?""",2000\r\n
答案 4 :(得分:1)
前几天我遇到了你的实施,并开始玩它。
我也为你写了一个解析器。
-module(csv_parser).
-export([parse_file/1]).
%% Reads File completely and parses the resulting binary with parse/1.
%% Crashes with a badmatch if the file cannot be read.
parse_file(File) ->
    {ok, Contents} = file:read_file(File),
    parse(Contents).
%% Parses CSV text into a list of rows, each row a list of binaries.
%% Empty lines are skipped. Each line is split on every comma, so a comma
%% inside a quoted field still splits that field (known limitation of the
%% regex-based approach).
parse(Data) ->
    %% "\r\n" is listed first so a CRLF is matched as one terminator.
    Lines = re:split(Data, "\r\n|\r|\n", []),
    [[unquote(Token) || Token <- re:split(Line, ",", [])]
     || Line <- Lines, Line =/= <<"">>].

%% Strips the surrounding double quotes from a token.
%% A token without any quote (including an empty one) is returned unchanged
%% instead of failing; a token with some other quote layout is returned raw.
unquote(Token) ->
    case re:split(Token, "\"", []) of
        [_, Quoted, _] -> Quoted;
        [Plain] -> Plain;
        [] -> <<"">>;
        _ -> Token
    end.
我甚至就这个 csv parser 写了一篇小博文。
答案 5 :(得分:0)
另一种可能的解决方案。可以轻松更改为延迟评估,因此无需立即读取整个文件。
%% Parses CSV text (a character list) into a list of rows, each row a list
%% of field strings, all in input order.
%% Rows end at LF or CRLF. A line break at the very end of the input does not
%% start another row, and empty input yields [].
parse(Data) ->
    parse(Data, [], [], []).

%% State machine for text outside quotes.
%%   Field  - characters of the current field, newest first
%%   Fields - fields of the current row, newest first
%%   Lines  - completed rows, newest first
parse([$\r | Data], Field, Fields, Lines) ->
    parse_r(Data, Field, Fields, Lines);
parse([$\n | Data], Field, Fields, Lines) ->
    parse(Data, [], [], [[Field | Fields] | Lines]);
parse([$, | Data], Field, Fields, Lines) ->
    parse(Data, [], [Field | Fields], Lines);
%% A quote opens a quoted field only at the very start of a field.
parse([$" | Data], [], Fields, Lines) ->
    parse_q(Data, [], Fields, Lines);
parse([C | Data], Field, Fields, Lines) ->
    parse(Data, [C | Field], Fields, Lines);
%% Input ended right after a line break (or was empty): no pending row.
parse([], [], [], Lines) ->
    finish(Lines);
parse([], Field, Fields, Lines) ->
    finish([[Field | Fields] | Lines]).

%% Restores input order at all three levels: rows, fields, characters.
finish(Lines) ->
    lists:reverse(
        [lists:reverse([lists:reverse(F) || F <- L]) || L <- Lines]
    ).

%% After a CR only an LF is accepted; the LF is handed back to parse/4.
%% A bare CR fails with function_clause.
parse_r([$\n | _] = Data, Field, Fields, Lines) ->
    parse(Data, Field, Fields, Lines).

%% Inside a quoted field: everything is literal until the next double quote.
%% Running out of input here fails with function_clause.
parse_q([$" | Data], Field, Fields, Lines) ->
    parse_qq(Data, Field, Fields, Lines);
parse_q([C | Data], Field, Fields, Lines) ->
    parse_q(Data, [C | Field], Fields, Lines).

%% Just after a double quote inside a quoted field: a second quote is an
%% escaped quote; a separator or line break closes the field; end of input
%% closes the field and the row. Any other character fails with
%% function_clause.
parse_qq([$" | Data], Field, Fields, Lines) ->
    parse_q(Data, [$" | Field], Fields, Lines);
parse_qq([C | _] = Data, Field, Fields, Lines)
  when C == $,; C == $\r; C == $\n ->
    parse(Data, Field, Fields, Lines);
parse_qq([], Field, Fields, Lines) ->
    %% Finish directly so that a final quoted empty field is still emitted.
    finish([[Field | Fields] | Lines]).
答案 6 :(得分:0)
我在zed的答案中添加了几项增强功能。
-module (helper_csv_parser).
-compile(export_all).
% Taken from http://stackoverflow.com/questions/1532081/csv-parser-in-erlang, modified to fix errors.
%% Opens File as UTF-8 text and parses it line by line as CSV.
%% Crashes with a badmatch if the file cannot be opened.
%% The first line is read here; parse/4 reads the rest and closes the file.
parse(File) ->
    {ok, F} = file:open(File, [read, {encoding, utf8}]),
    case file:read_line(F) of
        {ok, L} ->
            parse(F, string:strip(L, right, $\n), [], 1);
        eof ->
            %% Empty file: let the eof clause of parse/4 close the handle
            %% and return an empty result instead of failing on a badmatch.
            parse(F, eof, [], 1)
    end.
%% Line loop behind parse/1.
%%   F    - the open file handle
%%   Line - the current line (already stripped of its trailing LF), or eof
%%   Done - rows parsed so far, newest first
%%   Ctr  - 1-based number of Line within the file
%% Returns the rows in file order. The file is closed at end of file and
%% before throwing on a collect_line read error.
parse(F, eof, Done, _) ->
    file:close(F),
    lists:reverse(Done);
parse(F, Line, Done, Ctr) ->
    Res = file:read_line(F),
    case Res of
        {error, collect_line} ->
            %% Close the handle before leaving via throw so it is not leaked.
            file:close(F),
            %% Line number Ctr was read successfully; the read that failed
            %% was for the following line.
            throw({error, "Might be unicode at line " ++ helper:i2s(Ctr + 1)});
        {ok, L} ->
            parse(F, string:strip(L, right, $\n), [parse_line(Line) | Done], Ctr + 1);
        eof ->
            parse(F, eof, [parse_line(Line) | Done], Ctr + 1)
    end.
%% Splits one CSV line (without its line terminator) into a list of field
%% strings, in input order. Every field has its leading and trailing spaces
%% removed. A field may be wrapped in double quotes; inside quotes a comma
%% is literal and "" stands for a single double quote.
parse_line("," ++ _ = Line) ->
    %% A leading comma means the first field is empty. The comma is kept so
    %% parse_line/2 still consumes it as the separator before the second
    %% field; dropping it here would lose a field for input like ",,a".
    parse_line(Line, [[]]);
parse_line(Line) ->
    parse_line(Line, []).

%% Fields holds the fields parsed so far, newest first. At end of line the
%% list is reversed and returned; otherwise an optional separator is skipped
%% and the next field is parsed.
parse_line([], Fields) ->
    lists:reverse(Fields);
parse_line("," ++ Line, Fields) ->
    parse_field(Line, Fields);
parse_line(Line, Fields) ->
    parse_field(Line, Fields).

%% Chooses between quoted and unquoted parsing by looking at the first character.
parse_field("\"" ++ Line, Fields) ->
    parse_field_q(Line, [], Fields);
parse_field(Line, Fields) ->
    parse_field(Line, [], Fields).

%% Unquoted field: Buf collects characters in reverse until a comma or the
%% end of the line. The comma is left for parse_line/2 to consume.
parse_field("," ++ _ = Line, Buf, Fields) ->
    parse_line(Line, [string:strip(lists:reverse(Buf)) | Fields]);
parse_field([C | Line], Buf, Fields) ->
    parse_field(Line, [C | Buf], Fields);
parse_field([], Buf, Fields) ->
    %% Strip the last field too, so it is treated like every other field.
    parse_line([], [string:strip(lists:reverse(Buf)) | Fields]).

%% Convenience entry with an empty buffer. Not called by the functions in
%% this block; kept so any existing caller continues to work.
parse_field_q(Line, Fields) ->
    parse_field_q(Line, [], Fields).

%% Quoted field body (opening quote already consumed). "" is an escaped
%% quote; a single " closes the field. There is no clause for an empty
%% line, so an unterminated quoted field fails with function_clause.
parse_field_q("\"\"" ++ Line, Buf, Fields) ->
    parse_field_q(Line, [$" | Buf], Fields);
parse_field_q("\"" ++ Line, Buf, Fields) ->
    parse_line(Line, [string:strip(lists:reverse(Buf)) | Fields]);
parse_field_q([C | Line], Buf, Fields) ->
    parse_field_q(Line, [C | Buf], Fields).
答案 7 :(得分:0)
Wisher的回答并不错,除了它丢失了每个csv行的最后一个元素。这是对此的修复。它仍然没有处理嵌入式引用。
-module(csv).
-export([read/1]).
%% Reads File and parses its contents as CSV.
%% Returns {ok, Rows} on success. Any exception raised while reading or
%% parsing is caught and returned as {Class, Reason}.
read(File) ->
    try
        {ok, Contents} = file:read_file(File),
        Chars = binary_to_list(Contents),
        Rows = parse(Chars, [], [], []),
        {ok, Rows}
    catch
        Kind:Reason ->
            {Kind, Reason}
    end.
%% Character-by-character CSV parser.
%%   1st    - remaining input (character list)
%%   FBuff  - characters of the current unquoted field, newest first
%%   RBuff  - fields of the current row, newest first
%%   Result - completed rows, newest first
%% Returns the list of rows in input order, each row a list of strings.
%% Rows end at LF or CRLF; a final row without a line break is kept.
%% One space directly after a separating comma is skipped.
%% Doubled quotes inside a quoted field are NOT unescaped.
parse([], [], [], Result) ->
    %% Input ended right after a line break (or was empty): nothing pending.
    lists:reverse(Result);
parse([], FBuff, RBuff, Result) ->
    %% No trailing line break: flush the row that is still being built.
    lists:reverse([end_row(FBuff, RBuff) | Result]);
parse([$" | Rest], _FBuff, RBuff, Result) ->
    %% Anything buffered before the opening quote is discarded.
    {F, Rest1} = parse_q(Rest, []),
    after_q(Rest1, [F | RBuff], Result);
parse([$,, $\s | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$, | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$\r, $\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [end_row(FBuff, RBuff) | Result]);
parse([$\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [end_row(FBuff, RBuff) | Result]);
parse([A | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [A | FBuff], RBuff, Result).

%% Completes a row: the buffered unquoted field becomes the row's last field.
end_row(FBuff, RBuff) ->
    lists:reverse([lists:reverse(FBuff) | RBuff]).

%% Continues after a quoted field has been pushed onto RBuff. The field is
%% already complete, so a line break here must not add another (empty) field.
after_q([$,, $\s | Rest], RBuff, Result) ->
    parse(Rest, [], RBuff, Result);
after_q([$, | Rest], RBuff, Result) ->
    parse(Rest, [], RBuff, Result);
after_q([$\r, $\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
after_q([$\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
after_q([], RBuff, Result) ->
    lists:reverse([lists:reverse(RBuff) | Result]).

%% Reads the body of a quoted field (opening quote already consumed).
%% The field ends at a double quote that is followed by a comma, LF, CRLF or
%% the end of input; any other double quote is kept as a literal character.
%% Returns {Field, Rest} where Rest starts right after the closing quote.
%% There is no clause for running out of input, so an unterminated quoted
%% field fails with function_clause.
parse_q([$"], Result) ->
    {lists:reverse(Result), []};
parse_q([$" | [$, | _] = Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([$" | [$\n | _] = Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([$" | [$\r, $\n | _] = Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([A | Rest], Result) ->
    parse_q(Rest, [A | Result]).