erlang中的csv解析器

时间:2009-10-07 14:41:41

标签: csv erlang

对于我的应用程序,我必须使用Erlang解析CSV文件。以下是使用Erlang解析CSV的代码: -

parse_file(Fn) ->
{ok, Data} = file:read_file(Fn),
parse(binary_to_list(Data)).

parse(Data) -> lists:reverse(parse(Data, [])).

parse([], Acc) -> Acc;
parse(Data, Acc) ->
{Line, Tail} = parse_line(Data),
parse(Tail, [Line|Acc]).

parse_line(Data) ->
{Line, Tail} = parse_line(Data, []),
{lists:reverse(Line), Tail}.

parse_line([13,10|Data], Acc) -> {Acc, Data};
parse_line([10|Data], Acc) -> {Acc, Data};
parse_line([13|Data], Acc) -> {Acc, Data};
parse_line([], Acc) -> {Acc, []};
parse_line([$,,$,|Data], Acc) -> parse_line(Data, [""|Acc]);
parse_line([$,|Data], Acc) -> parse_line(Data, Acc);
parse_line(Data, Acc) ->
{Fld, Tail} = parse_field(Data),
parse_line(Tail, [Fld|Acc]).

parse_field([34|Data]) ->
{Fld, Tail} = parse_fieldq(Data, ""),
{lists:reverse(Fld), Tail};
parse_field(Data) ->
{Fld, Tail} = parse_field(Data, ""),
{lists:reverse(Fld), Tail}.

parse_field([$,|Tail], Acc) -> {Acc, [$,|Tail]};
parse_field([13|Tail], Acc) -> {Acc, [13|Tail]};
parse_field([10|Tail], Acc) -> {Acc, [10|Tail]};
parse_field([], Acc) -> {Acc, []};
parse_field([Ch|Tail], Acc) -> parse_field(Tail, [Ch|Acc]).

parse_fieldq([34,34|Tail], Acc) -> parse_fieldq(Tail, [34|Acc]);
parse_fieldq([34|Tail], Acc) -> {Acc, Tail};
parse_fieldq([Ch|Tail], Acc) -> parse_fieldq(Tail, [Ch|Acc]).

此代码工作正常,但有两个问题: - 1 - 因为代码使用双引号(“”)和逗号(,)进行解析并将每个值分开..但在下面的示例中,如果第一个名称包含双引号sting,则解析器将再创建一个字段。

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------"All Pain Will End."","","itisashwani4u@gmail.com"

result:-
[["contact"],["Ashwani  Garg ------"],["All Pain Will End."],[],["itisashwani4u@gmail.com"]]

expected result:-
[["contact"],["Ashwani  Garg ------All Pain Will End."],[],["itisashwani4u@gmail.com"]]

2 - 对于以下类型的csv它的值,它截断一些值: - 名字,姓氏,中间名,姓名,昵称,电子邮件地址,家庭街,家乡城市,家庭邮政编码,家庭状态,家乡/地区,家庭电话,家庭传真,移动电话,个人网页,商务街,商业城,商业邮政编码,商业状态,商业国家/地区,商业网页,商务电话,商业传真,寻呼机,公司,职位,部门,办公地点,备注

    Affection,,,Affection,,,,,,,,+919845141544,,+919845141544,,,,,,,,,,,,,,,
    result:-
    [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[]]
    expected result:-
   [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]

请帮助我...参考请使用以下链接: -   http://ppolv.wordpress.com/2008/02/25/parsing-csv-in-erlang/

8 个答案:

答案 0 :(得分:6)

parse(File) ->
  {ok, F} = file:open(File, [read, raw]),
  parse(F, file:read_line(F), []).

parse(F, eof, Done) ->
  file:close(F),
  lists:reverse(Done);    

parse(F, Line, Done) ->
  parse(F, file:read_line(F), [parse_line(Line)|Done]).



parse_line(Line) -> parse_line(Line, []).

parse_line([], Fields) -> lists:reverse(Fields);
parse_line("," ++ Line, Fields) -> parse_field(Line, Fields);
parse_line(Line, Fields) -> parse_field(Line, Fields).

parse_field("\"" ++ Line, Fields) -> parse_field_q(Line, [], Fields);
parse_field(Line, Fields) -> parse_field(Line, [], Fields).

parse_field("," ++ _ = Line, Buf, Fields) -> parse_line(Line, [lists:reverse(Buf)|Fields]);
parse_field([C|Line], Buf, Fields) -> parse_field(Line, [C|Buf], Fields);
parse_field([], Buf, Fields) -> parse_line([], [lists:reverse(Buf)|Fields]).

parse_field_q(Line, Fields) -> parse_field_q(Line, [], Fields).
parse_field_q("\"\"" ++ Line, Buf, Fields) -> parse_field_q(Line, [$"|Buf], Fields);
parse_field_q("\"" ++ Line, Buf, Fields) -> parse_line(Line, [lists:reverse(Buf)|Fields]);
parse_field_q([C|Line], Buf, Fields) -> parse_field_q(Line, [C|Buf], Fields).

没有文件:read_line:

parse_file(File) ->
  {ok, Data} = file:read_file(File),
  parse(binary_to_list(Data), []).

parse([], Done) ->
  lists:reverse(Done);

parse(Data, Done) ->
  {Line, Rest} = case re:split(Data, "\r|\n|\r\n", [{return, list}, {parts, 2}]) of
                   [L,R] -> {L,R};
                   [L]   -> {L,[]}
                 end,
  parse(Rest, [parse_line(Line)|Done]).

答案 1 :(得分:2)

副作用:

您是如何创建CSV输入的?它似乎不是有效的CSV(不过对CSV有特别严格的规范)。

通常在CSV字段中使用双引号,它们需要作为一对双引号进行转义,因此您的示例将是:

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------""All Pain Will End.""","","itisashwani4u@gmail.com"

这将导入到开放式办公室电子表格中,而原始示例则​​没有。

答案 2 :(得分:1)

Trapexit中也讨论了从文件中读取行。根据您的需求调整应该是微不足道的:

http://www.trapexit.org/Reading_Lines_from_a_File

答案 3 :(得分:1)

我的实施:

-module(csv).

-export([
    parse/1
]).

parse(File) ->
    try
        {ok, Bin} = file:read_file(File),
        {ok, parse(binary_to_list(Bin), [], [], [])}
    catch
        Class:Error ->
            {Class, Error}
    end.

parse([], _FBuff, _RBuff, Result) ->
    lists:reverse(Result);
parse([$" | Rest], _FBuff, RBuff, Result) ->
    {F, Rest1} = parse_q(Rest, []),
    parse(Rest1, [], [F | RBuff], Result);
parse([$,, $\s| Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);    
parse([$, | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$\r, $\n | Rest], _FBuff, RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse([$\n | Rest], _FBuff, RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse([A | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [A | FBuff], RBuff, Result).

parse_q([$", $, | Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([A | Rest], Result) ->
    parse_q(Rest, [A | Result]).

但是,此解决方案无法处理嵌套引号...

例如:

1,“你好,”世界“”,“她说:”这是“解决方案”,不是吗?“”,2000 \ r \ n

答案 4 :(得分:1)

前几天我遇到了你的实施,并开始玩它。

我也让你成为解析器。

-module(csv_parser).

-export([parse_file/1]).

parse_file(File) ->
  {ok, Data} = file:read_file(File),
  parse(Data).

parse(Data) ->
    Lines = re:split(Data, "\r|\n|\r\n", [] ), 
    [ [begin
           case  re:split(Token, "\"", [] ) of 
               [_,T,_] -> T;
               [] -> <<"">>
           end
       end || Token <- re:split(Line, ",", [] ) ] || Line <- Lines, Line =/= <<"">>].

我甚至在这个csv parser

上写了一篇小博文

答案 5 :(得分:0)

另一种可能的解决方案。可以轻松更改为延迟评估,因此无需立即读取整个文件。

parse(Data) -> parse(Data, [], [], []).

parse([$\r|Data], Field, Fields, Lines) -> parse_r(Data, Field, Fields, Lines);
parse([$\n|Data], Field, Fields, Lines) -> parse(Data, [], [], [[Field|Fields]|Lines]);
parse([$,|Data], Field, Fields, Lines)  -> parse(Data, [], [Field|Fields], Lines);
parse([$"|Data], [], Fields, Lines)     -> parse_q(Data, [], Fields, Lines);
parse([C|Data], Field, Fields, Lines)   -> parse(Data, [C|Field], Fields, Lines);
parse([], Field, Fields, Lines)         -> 
  lists:reverse(
      [lists:reverse(
          [lists:reverse(F) || F <- L]
        ) || L <- [[Field|Fields]|Lines]]
    ).

parse_r([$\n|_] = Data, Field, Fields, Lines) -> parse(Data, Field, Fields, Lines).

parse_q([$"|Data], Field, Fields, Lines) -> parse_qq(Data, Field, Fields, Lines);
parse_q([C|Data], Field, Fields, Lines)  -> parse_q(Data, [C|Field], Fields, Lines).

parse_qq([$"|Data], Field, Fields, Lines) -> parse_q(Data, [$"|Field], Fields, Lines);
parse_qq([C|_] = Data, Field, Fields, Lines)  
  when C == $,; C == $\r; C == $\n        -> parse(Data, Field, Fields, Lines);
parse_qq([], Field, Fields, Lines)        -> parse([], Field, Fields, Lines).

答案 6 :(得分:0)

我在zed的答案中添加了几项增强功能。

-module (helper_csv_parser).
-compile(export_all).

% Taken from http://stackoverflow.com/questions/1532081/csv-parser-in-erlang, modified to fix errors.
parse(File) ->
    {ok, F} = file:open(File, [read, {encoding, utf8}]),
    {ok, L} = file:read_line(F),
    parse(F, string:strip(L, right, $\n), [], 1).

parse(F, eof, Done, _) ->
    file:close(F),
    lists:reverse(Done);

parse(F, Line, Done, Ctr) ->
    Res = file:read_line(F),

    case Res of
        {error,collect_line} -> throw({error, "Might be unicode at line " ++ helper:i2s(Ctr)});
        {ok, L} -> parse(F, string:strip(L, right, $\n),[parse_line(Line)|Done], Ctr+1);
        eof -> parse(F,eof,[parse_line(Line)|Done], Ctr+1)
    end.

parse_line("," ++ Line) -> parse_line(Line, [[]]);
parse_line(Line) -> parse_line(Line, []).

parse_line([], Fields) -> lists:reverse(Fields);
parse_line("," ++ Line, Fields) -> parse_field(Line, Fields);
parse_line(Line, Fields) -> parse_field(Line, Fields).

parse_field("\"" ++ Line, Fields) -> parse_field_q(Line, [], Fields);
parse_field(Line, Fields) -> parse_field(Line, [], Fields).

parse_field("," ++ _ = Line, Buf, Fields) -> parse_line(Line, [string:strip(lists:reverse(Buf))|Fields]);
parse_field([C|Line], Buf, Fields) -> parse_field(Line, [C|Buf], Fields);
parse_field([], Buf, Fields) -> parse_line([], [lists:reverse(Buf)|Fields]).

parse_field_q(Line, Fields) -> parse_field_q(Line, [], Fields).
parse_field_q("\"\"" ++ Line, Buf, Fields) -> parse_field_q(Line, [$"|Buf], Fields);
parse_field_q("\"" ++ Line, Buf, Fields) -> parse_line(Line, [string:strip(lists:reverse(Buf))|Fields]);
parse_field_q([C|Line], Buf, Fields) -> parse_field_q(Line, [C|Buf], Fields).

答案 7 :(得分:0)

Wisher的回答并不错,除了它丢失了每个csv行的最后一个元素。这是对此的修复。它仍然没有处理嵌入式引用。

-module(csv).

-export([read/1]).

read(File) ->
    try
        {ok, Bin} = file:read_file(File),
        {ok, parse(binary_to_list(Bin), [], [], [])}
    catch
        Class:Error ->
            {Class, Error}
    end.

parse([], _FBuff, _RBuff, Result) ->
    lists:reverse(Result);
parse([$" | Rest], _FBuff, RBuff, Result) ->
    {F, Rest1} = parse_q(Rest, []),
    parse(Rest1, [], [F | RBuff], Result);
parse([$,, $\s| Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$, | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$\r, $\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse([lists:reverse(FBuff) | RBuff]) | Result]);
parse([$\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse([lists:reverse(FBuff) | RBuff]) | Result]);
parse([A | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [A | FBuff], RBuff, Result).

parse_q([$", $, | Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([A | Rest], Result) ->
    parse_q(Rest, [A | Result]).