Question

问题「Split a string by commas but ignore commas within double-quotes using Javascript」给出了正则表达式 /(".*?"|[^",\s]+)(?=\s*,|\s*$)/。如何在 Erlang 的 re:split() 中使用它？把这个正则表达式直接照搬到 Erlang 中不起作用。

1> S = "20140419,\"Blah blah, foo foo\",1,0,0,0,1,2,0,0".
2> re:split(S, "(\".*?\"|[^\",\s]+,)(?=\s*,|\s*$)", [{return,list}]).
["20140419,","\"Blah blah, foo foo\"",",1,0,0,0,1,2,0,0"]

我正在寻找的结果是列表

["20140419","\"Blah blah, foo foo\"","1","0","0","0","1","2","0","0"]

感谢。

Answer 1

只需将JavaScript正则表达式翻译为Erlang：

Erlang R16B03-1 (erts-5.10.4) [source] [64-bit] [smp:8:8] [async-threads:10] [kernel-poll:false]

Eshell V5.10.4  (abort with ^G)
1> S = "20140419,\"Blah blah, foo foo\",1,0,0,0,1,2,0,0".
"20140419,\"Blah blah, foo foo\",1,0,0,0,1,2,0,0"
2> {ok,R} = re:compile("(\".*?\"|[^\",\\s]+)(?=\\s*,|\\s*$)").
{ok,{re_pattern,1,0,
                <<69,82,67,80,122,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,
                  ...>>}}
3> {match,Matches} = re:run(S, R, [{capture,[1],list},global]).
{match,[["20140419"],
        ["\"Blah blah, foo foo\""],
        ["1"],
        ["0"],
        ["0"],
        ["0"],
        ["1"],
        ["2"],
        ["0"],
        ["0"]]}
4> [M || [M] <- Matches].
["20140419","\"Blah blah, foo foo\"","1","0","0","0","1",
 "2","0","0"]

注意：在 shell 命令 2 中，模式里必须使用双反斜杠（\\s），因为 Erlang 字符串会先处理一次转义，这样正则引擎才能收到 \s。

Answer 2

我用模式匹配写了一个解析器。贴在这里，希望对其他人有用。

%% @doc Splits one CSV line into its fields.
%%
%% Commas inside a double-quoted section are treated as ordinary text.
%% The quote characters themselves are kept in the returned fields, so
%% "foo,\"a, b\",baz" yields ["foo", "\"a, b\"", "baz"].
%%
%% An empty input yields [""] and empty fields are preserved
%% ("a,,b" yields ["a", "", "b"]). An unterminated quote swallows the rest
%% of the line into the last field.
parse_csv(String) -> parse_csv(String, [], [], [], false).

%% Five-argument form, kept so existing callers of this arity still work.
%%   String   - remaining input characters
%%   S        - completed fields, most recent first, each stored reversed
%%   Acc      - reversed unquoted text of the current field
%%   L        - reversed quoted text of the current field
%%   IsSubStr - true while scanning inside double quotes
%% The two partial accumulators are merged into a single reversed buffer
%% (quoted part is the most recent text, so it goes in front).
parse_csv(String, S, Acc, L, IsSubStr) ->
    parse_csv_fields(String, S, L ++ Acc, IsSubStr).

%% Walks the input once, keeping the current field in one reversed buffer.
%% Using a single buffer preserves the order of quoted and unquoted text
%% within a field, so inputs such as `"foo"bar`, `foo"bar"` and the `""`
%% escape no longer lose characters or crash at end of input.
parse_csv_fields([], Fields, Current, _InQuotes) ->
    % End of input: the pending buffer is the last field.
    lists:reverse([lists:reverse(Field) || Field <- [Current | Fields]]);
parse_csv_fields([$" | Rest], Fields, Current, InQuotes) ->
    % Every quote is kept and flips the "inside quotes" state.
    parse_csv_fields(Rest, Fields, [$" | Current], not InQuotes);
parse_csv_fields([$, | Rest], Fields, Current, false) ->
    % A comma outside quotes terminates the current field.
    parse_csv_fields(Rest, [Current | Fields], [], false);
parse_csv_fields([Char | Rest], Fields, Current, InQuotes) ->
    % Any other character (including a comma inside quotes) is field text.
    parse_csv_fields(Rest, Fields, [Char | Current], InQuotes).

示例：

2> ql:parse_csv("foo,\"bar aa\",blah,\"dooo\",phew").                          
["foo","\"bar aa\"","blah","\"dooo\"","phew"]
3> ql:parse_csv("foo,bar,baz").
["foo","bar","baz"]
4> ql:parse_csv("foo,\"foo, bar\",baz").
["foo","\"foo, bar\"","baz"]

Answer 3

这个怎么样：

1> string:tokens(S, ",").
["20140419","\"Blah blah"," foo foo\"","1","0","0","0","1","2","0","0"]

甚至：

2> re:split(S, ",", [{return,list}]).
["20140419","\"Blah blah"," foo foo\"","1","0","0","0","1","2","0","0"]

string:tokens/2's doc.

（@kadaj，仅仅为了解析 CSV，这代码量可不小）

编辑：要真正回答这个问题，还需要把 "\"…" 和 "…\"" 这样成对的片段重新拼接起来。一个简单的递归函数就能做到：

%% Re-assembles quoted fields that a plain split on "," has broken apart.
%%
%% Tokens - the fragments produced by splitting a CSV line on commas
%% Acc    - fields collected so far, most recent first (pass [] initially)
%% Returns the list of fields in their original order.
%%
%% A fragment containing an odd number of double quotes opens a quoted
%% section. It is glued to the following fragments, with the commas the
%% split removed put back, up to and including the next fragment that also
%% has an odd number of quotes. A fragment that is already balanced, such
%% as "\"abc\"", is passed through untouched. If the quote is never closed,
%% the remaining fragments all go into the last field.
%%
%% NOTE(review): string:tokens/2 discards empty fragments, so consecutive
%% commas inside quotes cannot be restored from its output; re:split/3
%% keeps them.
finish([], Acc) ->
    lists:reverse(Acc);
finish([Token | Rest], Acc) ->
    case has_open_quote(Token) of
        false ->
            finish(Rest, [Token | Acc]);
        true ->
            {Field, Remaining} = rejoin_quoted(Rest, [Token]),
            finish(Remaining, [Field | Acc])
    end.

%% Collects fragments (most recent first) until the one that closes the quote.
rejoin_quoted([], Parts) ->
    {join_with_commas(Parts), []};
rejoin_quoted([Token | Rest], Parts) ->
    case has_open_quote(Token) of
        true -> {join_with_commas([Token | Parts]), Rest};
        false -> rejoin_quoted(Rest, [Token | Parts])
    end.

%% True when the fragment holds an odd number of double quotes.
has_open_quote(Token) ->
    length([C || C <- Token, C =:= $"]) rem 2 =:= 1.

%% Joins fragments given most-recent-first back into one string, restoring
%% the comma between each pair. Builds right to left so each fragment is
%% copied only once.
join_with_commas([Last | EarlierRev]) ->
    lists:foldl(fun(Part, Joined) -> Part ++ [$, | Joined] end, Last, EarlierRev).

Erlang：如何按逗号拆分一行，但忽略引号内的逗号？

3 个答案: