Erlang中CYK算法实现的代码审查

时间:2014-04-30 15:47:11

标签: erlang cyk

我开始使用Erlang,作为练习,我尝试实现CYK algorithm

主要代码(cyk.erl):

%%% An attempt for a CYK parser in Erlang

-module(cyk).

-export([
         init_storage/0,
         import_grammar_file/1,
         add_grammar_rule/1,
         analyze/1,
         test_analyze/0
        ]).

%% Initialize the ets storage for grammar
init_storage() ->
  ets:new(?MODULE, [bag, named_table]).

%%------------------------------------------
%% 
%% Grammar
%%
%%------------------------------------------

%% Import a grammar file
import_grammar_file(File) ->
  {ok, Device} = file:open(File, read),
  import_file_rules(Device).

%% Import all the rules in the file
import_file_rules(Device) ->
  case io:get_line(Device, "") of
    eof ->
      io:format("Grammar file imported~n"),
      file:close(Device);
    Line ->
      add_grammar_rule(Line),
      import_file_rules(Device)
  end.

%% Add a grammar rule
add_grammar_rule(Rule) ->
  case re:run(Rule, "^([^\s]+)\s?->\s?([^\n]+)$", [{capture, all_but_first, binary}]) of
    {match, [A, B]} ->
      ets:insert(?MODULE, {A, B}),
      io:format("parsing ~p -> ~p~n", [A, B]);
    nomatch ->
      io:format("cannot parse ~p~n", [Rule])
  end.

%%------------------------------------------
%% 
%% Main logic
%%
%%------------------------------------------

%% Analyze a sentence
analyze(Sentence) ->
  io:format("analysing: ~p~n", [Sentence]),
  WordList = re:split(Sentence, " "),
  io:format("wordlist: ~p~n", [WordList]),
  Representation = lists:map( fun(Word) -> associate(Word) end, WordList),
  io:format("representation: ~p~n", [Representation]),
  Result = process([Representation]),
  io:format("result: ~p~n", [Result]).

% associate sentence words with grammar terms
associate(Word) ->
  case ets:match(cyk, {'$1', Word}) of
    [H|T] -> lists:flatten([H|T]);
    [] -> []
  end.

% process sentence representation
process(Representation) ->
  Limit = length(lists:last(Representation)),
  process(Representation, Limit).

process(Representation, Limit) when Limit > 1 ->
  NextStep = process(Representation, 1, Limit-1, []),
  process([NextStep|Representation], Limit-1);
process(Representation, _Limit) ->
  Representation.

process(Representation, Index, Limit, Acc) when Index =< Limit ->
  Subtree = extract_subtree(lists:reverse(Representation), Index),
  Result = process_subtree(Subtree),
  process(Representation, Index+1, Limit, [Result|Acc]);
process(_Representation, _Index, _Limit, Acc) ->
  lists:reverse(Acc).

%%------------------------------------------
%% 
%% Subtree
%%
%%------------------------------------------

process_subtree(Subtree) ->
  process_subtree(Subtree, Subtree, [], 1).

process_subtree([], _Subtree, Acc, _Index) ->
  Acc;
process_subtree([H|T], Subtree, Acc, Index) ->
  A = lists:nth(1,H),
  Bind = length( Subtree ) - Index + 1,
  B = lists:last( lists:nth( Bind, Subtree) ),
  % generating the possibilities of grammar
  Pos = [ list_to_binary(binary:bin_to_list(X)++" "++binary:bin_to_list(Y)) || X<-A, Y<-B ],
  % looking up in the grammar
  Result = lists:flatten( [ ets:match(cyk, {'$1', X}) || X <- Pos ] ),
  process_subtree(T, Subtree, Acc++Result, Index + 1).

%% Extract a subtree from the representation 
extract_subtree(Representation, Position) ->
  Size = length(Representation) + 1,
  extract_subtree(Representation, Size, Position, []).

extract_subtree([], _Size, _Position, Acc) ->
  lists:reverse(Acc);
extract_subtree([H|T], Size, Position, Acc) ->
  Segment = lists:sublist(H, Position, Size),
  extract_subtree(T, Size - 1, Position, [Segment|Acc]).

%%------------------------------------------
%% 
%% Test
%% using the same example as 
%% http://en.wikipedia.org/wiki/CYK_algorithm
%%
%%------------------------------------------
test_analyze() ->
  init_storage(),
  import_grammar_file("grammar.txt"),
  analyze("she eats a fish with a fork").

语法文件(grammar.txt)

S -> NP VP
VP -> VP PP
VP -> V NP
VP -> eats
PP -> P NP
NP -> Det N
NP -> she 
V -> eats
P -> with
N -> fish
N -> fork
Det -> a

可以从erlang shell

测试代码
> c(cyk).
> cyk:test_analyze().
parsing <<"S">> -> <<"NP VP">>
parsing <<"VP">> -> <<"VP PP">>
parsing <<"VP">> -> <<"V NP">>
parsing <<"VP">> -> <<"eats">>
parsing <<"PP">> -> <<"P NP">>
parsing <<"NP">> -> <<"Det N">>
parsing <<"NP">> -> <<"she">>
parsing <<"V">> -> <<"eats">>
parsing <<"P">> -> <<"with">>
parsing <<"N">> -> <<"fish">>
parsing <<"N">> -> <<"fork">>
parsing <<"Det">> -> <<"a">>
Grammar file imported
analysing: "she eats a fish with a fork"
wordlist: [<<"she">>,<<"eats">>,<<"a">>,<<"fish">>,<<"with">>,<<"a">>,
           <<"fork">>]
representation: [[<<"NP">>],
                 [<<"VP">>,<<"V">>],
                 [<<"Det">>],
                 [<<"N">>],
                 [<<"P">>],
                 [<<"Det">>],
                 [<<"N">>]]
result: [[[<<"S">>]],
         [[],[<<"VP">>]],
         [[],[],[]],
         [[<<"S">>],[],[],[]],
         [[],[<<"VP">>],[],[],[<<"PP">>]],
         [[<<"S">>],[],[<<"NP">>],[],[],[<<"NP">>]],
         [[<<"NP">>],
          [<<"VP">>,<<"V">>],
          [<<"Det">>],
          [<<"N">>],
          [<<"P">>],
          [<<"Det">>],
          [<<"N">>]]]

这个代码的代码似乎工作得很好,但我一直在寻找改进它的方法(使其更加erlang-ish),并特别使处理分布在多个进程/节点上。

我猜每个步骤的所有process_subtree执行都可以同时完成,但我无法确定如何实现。

任何建议都将不胜感激!

1 个答案:

答案 0 :(得分:4)

我写过这个使用并发执行的解决方案。

与Eric解决方案相比,多进程的使用需要进行一些更改,其他因为我认为它更有效(我在规则中恢复了键和值,我选择了一组),有些是因为我认为它更干净(我关闭打开它的函数中的语法文件)和一些因为我更熟悉这些模块(字符串:令牌......)。

<强> [编辑]

我已经通过更快的递归调用替换了无用的spawn,并通过添加消息来同步进程来抑制等待函数。

我明白了这个实现在javascript animation of cyk algorithm

处查看漂亮的动画

@Eric,可以查看分析的所有步骤,用观察者分析ets分析,这就是为什么我不删除它。

-module(cyk).

-export([
         import_grammar_file/1,
         add_grammar_rule/2,
         analyze/1,
         test_analyze/1,
         test_analyze/0
        ]).

%%------------------------------------------
%% 
%% Grammar
%%
%%------------------------------------------

%% Import a grammar file
import_grammar_file(File) ->
  reset_ets(rules, ets:info(rules)),
  {ok, Device} = file:open(File, read),
  ok = add_grammar_rule(Device,file:read_line(Device)),
  file:close(Device),
  io:format("Grammar file imported~n").

%% Add a grammar rule
add_grammar_rule(_,eof) -> ok;
add_grammar_rule(Device,{ok,Rule}) ->
  [T,"->",H|Q] = string:tokens(Rule," \n"),
  Key = key(H,Q),
  insert(Key,T,ets:lookup(rules, Key)),  
  add_grammar_rule(Device,file:read_line(Device)).

key(H,[]) -> H;
key(H,[Q]) -> {H,Q}.

insert(Key,T,[]) -> ets:insert(rules, {Key,[T]});
insert(Key,T,[{Key,L}]) -> ets:insert(rules, {Key,[T|L]}).


%%------------------------------------------
%% 
%% Main logic
%%
%%------------------------------------------

%% Analyze a sentence
analyze(Sentence) ->
  reset_ets(analyze, ets:info(analyze)),
  io:format("analysing: ~p~n", [Sentence]),
  WordList = string:tokens(Sentence, " "),
  Len = length(WordList),
  Me = self(),
  lists:foldl(fun(X,{J,Pid}) -> ets:insert(analyze,{{0,J},ets:lookup_element(rules,X,2)}),
                          (NewPid = spawn(fun() -> whatis(1,J,Len,Pid,Me) end)) ! {done,0},
                          {J+1,NewPid} end,
                        {1,none}, WordList),
  receive
    M -> M
  end.

reset_ets(Name, undefined) -> ets:new(Name,[set, named_table,public]);
reset_ets(Name, _) -> ets:delete_all_objects(Name).

whatis(Len,1,Len,_,PidRet) -> PidRet ! ets:lookup_element(analyze,{Len-1,1},2); % finished
whatis(I,J,Len,_,_) when I + J == Len +1 -> ok; % ends useless processes
whatis(I,J,Len,Pid,PidRet) ->
  receive {done,V} when V == I-1 -> ok end,
  Cases = lists:map(fun({X,Y}) -> [{A,B} || A <- ets:lookup_element(analyze,X,2), 
                                            B <- ets:lookup_element(analyze,Y,2)] end,
                         [{{X-1,J},{I-X,J+X}} || X <- lists:seq(1,I)]),
  Val = lists:foldl(fun(X,Acc) -> case ets:lookup(rules,X) of
                                      [] -> Acc;
                                      [{_,[R]}] -> [R|Acc]
                                      end end,
                                      [],lists:flatten(Cases)),
  ets:insert(analyze,{{I,J},Val}),
  send(Pid,I),
  whatis(I+1,J,Len,Pid,PidRet).

send(none,_) -> ok;
send(Pid,I) -> Pid ! {done,I}.

%%------------------------------------------
%% 
%% Test
%% using the same example as 
%% http://en.wikipedia.org/wiki/CYK_algorithm
%%
%%------------------------------------------
test_analyze(S) ->
  import_grammar_file("grammar.txt"),
  analyze(S).

test_analyze() ->
  test_analyze("she eats a fish with a fork").