Quis custodiet ipsos custodes? - (Decimus Iunius Iuvenalis)
我有以下设置:
在一个节点('one@erlang.enzo')上运行的服务器进程有一个看门狗(watchdog)进程,运行在另一个节点('two@erlang.enzo')上。当服务器启动时,它会在远程节点上启动其看门狗。当服务器退出时,看门狗会重新启动服务器。当看门狗退出时,服务器会重新启动它。
网络启动后,服务器将作为运行级别的一部分启动。
服务器还监视远程节点,并在它(即节点)上线时立即启动看门狗。服务器和看门狗之间的连接丢失可能有两个原因:第一,网络可能中断;第二,节点可能崩溃或被杀死。
我的代码似乎可以工作,但我有点怀疑以下情况:
我的问题是:
编辑:die 和 kill_dog 消息仅用于模拟非正常退出,只在调试时使用。
以下是代码:
-module (watchdog).
-compile (export_all).
%% @doc Entry point of the watchdog process.
%%
%% Enables exit trapping, announces the node the watchdog runs on and then
%% enters the receive loop. Returns whatever loop/0 returns.
init() ->
    %% Trap exits before doing any I/O: io:format/2 is a blocking request to
    %% the group leader, and an exit signal from a linked process arriving
    %% in the meantime would otherwise terminate the watchdog before it had
    %% a chance to react to it.
    process_flag(trap_exit, true),
    io:format("Watchdog: Starting @ ~p.~n", [node()]),
    loop().
%% @doc Receive loop of the watchdog.
%%
%% Takes messages from the mailbox one at a time and hands each of them to
%% on_message/1. The loop only continues for messages that are not handled
%% specially; every other clause lets the function return.
loop() ->
    receive
        Message -> on_message(Message)
    end.

%% Reacts to a single message taken from the watchdog's mailbox.
%% `die' is a debugging aid that forces a badarith crash.
on_message(die) ->
    1 / 0;
%% A linked process exited normally: report it and stop looping.
on_message({'EXIT', _From, normal}) ->
    io:format("Watchdog: Server shut down.~n");
%% A linked process exited for any other reason: start the server again on
%% its node, then stop looping.
on_message({'EXIT', _From, _Reason}) ->
    io:format("Watchdog: Restarting server.~n"),
    spawn('one@erlang.enzo', server, start, []);
%% Anything else is dropped and the watchdog keeps waiting.
on_message(_Other) ->
    loop().
-module (server).
-compile (export_all).
%% @doc Starts the server process and registers it under the name `server'.
%%
%% Returns `true' on success. Fails with `badarg' when the name cannot be
%% registered (for example because a process is already registered as
%% `server'); in that case the freshly spawned process is killed so that no
%% unregistered duplicate keeps running.
start() ->
    io:format("Server: Starting up.~n"),
    Pid = spawn(fun init/0),
    try
        register(server, Pid)
    catch
        error:badarg ->
            %% The process has already been spawned when register/2 fails,
            %% so it must be removed explicitly before the error is passed
            %% on to the caller.
            exit(Pid, kill),
            erlang:error(badarg)
    end.
%% @doc Asks the registered server process to shut down.
%%
%% Sends `stop' to the process registered as `server' and returns `stop'.
%% Fails with `badarg' when no process is registered under that name.
stop() ->
    server ! stop.
%% @doc Body of the server process.
%%
%% Turns on exit trapping, requests node monitoring for the watchdog node
%% and enters the main loop with status `down' and no watchdog pid.
init() ->
    WatchdogNode = 'two@erlang.enzo',
    process_flag(trap_exit, true),
    monitor_node(WatchdogNode, true),
    loop(down, none).
%% @doc Main loop of the server.
%%
%% `Status' is `up' or `down' and records whether the watchdog node is
%% currently considered reachable. `Watchdog' is the pid of the current
%% watchdog, or `none' when there is none. Each iteration handles one
%% message; after 2000 ms without any message it calls checkNode/0 while
%% the status is `down'. Returns `ok' once a `stop' message was handled.
loop(Status, Watchdog) ->
    {NewStatus, NewWatchdog} =
        receive
            %% Debugging aid: force a badarith crash of the server.
            die ->
                1 / 0;
            stop ->
                {stop, none};
            %% Debugging aid: make the watchdog crash.
            kill_dog when is_pid(Watchdog) ->
                Watchdog ! die,
                {Status, Watchdog};
            %% No watchdog pid is known. Sending to the atom `none' would
            %% raise badarg and crash the server instead of the watchdog.
            kill_dog ->
                io:format("Server: No watchdog to kill.~n"),
                {Status, Watchdog};
            {nodedown, 'two@erlang.enzo'} ->
                io:format("Server: Watchdog node has gone down.~n"),
                {down, Watchdog};
            %% The connection to the watchdog's node was lost. Nothing is
            %% respawned here; the timeout branch below calls checkNode/0
            %% once the status is `down'.
            {'EXIT', Watchdog, noconnection} ->
                {Status, Watchdog};
            %% The watchdog exited for any other reason: start a new one.
            {'EXIT', Watchdog, Reason} ->
                io:format("Server: Watchdog has died of ~p.~n", [Reason]),
                {Status, spawn_link('two@erlang.enzo', watchdog, init, [])};
            %% Unknown messages are discarded.
            _ ->
                {Status, Watchdog}
        after 2000 ->
            case Status of
                down -> checkNode();
                up -> {up, Watchdog}
            end
        end,
    case NewStatus of
        stop -> ok;
        _ -> loop(NewStatus, NewWatchdog)
    end.
%% @doc Probes whether the watchdog node is connected.
%%
%% Calls net_adm:world/0 and then looks for 'two@erlang.enzo' among the
%% connected nodes. Returns `{down, none}' when it is absent. Otherwise
%% requests node monitoring, starts a linked watchdog on that node and
%% returns `{up, WatchdogPid}'.
checkNode() ->
    Target = 'two@erlang.enzo',
    net_adm:world(),
    case lists:member(Target, nodes()) of
        true ->
            io:format("Server: Watchdog node has come online.~n"),
            monitor_node(Target, true),
            {up, spawn_link(Target, watchdog, init, [])};
        false ->
            io:format("Server: Watchdog node is still down.~n"),
            {down, none}
    end.
答案 0 :(得分:1)
使用 global 模块注册看门狗可以消除您的顾虑:
watchdog.erl:
-module (watchdog).
-compile (export_all).
%% @doc Entry point of the watchdog process.
%%
%% Enables exit trapping, announces the node the watchdog runs on, tries to
%% register the process globally as `watchdog' and then enters the receive
%% loop. Returns whatever loop/0 returns.
init() ->
    %% Trap exits before doing any I/O: io:format/2 is a blocking request to
    %% the group leader, and an exit signal from a linked process arriving
    %% in the meantime would otherwise terminate the watchdog before it had
    %% a chance to react to it.
    process_flag(trap_exit, true),
    io:format("Watchdog: Starting @ ~p.~n", [node()]),
    %% global:register_name/2 answers `yes' or `no'. A refusal is reported
    %% instead of being dropped silently; the watchdog keeps running in
    %% either case.
    case global:register_name(watchdog, self()) of
        yes ->
            ok;
        no ->
            io:format("Watchdog: Name 'watchdog' is already registered.~n")
    end,
    loop().
%% @doc Receive loop of the watchdog.
%%
%% Takes messages from the mailbox one at a time and hands each of them to
%% on_message/1. The loop only continues for messages that are not handled
%% specially; every other clause lets the function return.
loop() ->
    receive
        Message -> on_message(Message)
    end.

%% Reacts to a single message taken from the watchdog's mailbox.
%% `die' is a debugging aid that forces a badarith crash.
on_message(die) ->
    1 / 0;
%% A linked process exited normally: report it and stop looping.
on_message({'EXIT', _From, normal}) ->
    io:format("Watchdog: Server shut down.~n");
%% A linked process exited for any other reason: start the server again on
%% its node, then stop looping.
on_message({'EXIT', _From, _Reason}) ->
    io:format("Watchdog: Restarting server.~n"),
    spawn('one@erlang.enzo', server, start, []);
%% Anything else is dropped and the watchdog keeps waiting.
on_message(_Other) ->
    loop().
server.erl:
%% @doc Probes whether the watchdog node is connected and makes sure a
%% watchdog is available on it.
%%
%% Returns `{down, none}' when 'two@erlang.enzo' is not among the connected
%% nodes. Otherwise requests node monitoring, looks the watchdog up in the
%% global name registry, starts a linked one if none is registered, and
%% returns `{up, WatchdogPid}'.
checkNode() ->
    net_adm:world(),
    case lists:member('two@erlang.enzo', nodes()) of
        false ->
            io:format("Server: Watchdog node is still down.~n"),
            {down, none};
        true ->
            io:format("Server: Watchdog node has come online.~n"),
            %% Ask for node monitoring again so that a later loss of the
            %% node is reported to this process as a nodedown message.
            monitor_node('two@erlang.enzo', true),
            global:sync(), %% not sure if this is necessary
            Watchdog =
                case global:whereis_name(watchdog) of
                    undefined ->
                        io:format("Watchdog process is dead"),
                        spawn_link('two@erlang.enzo', watchdog, init, []);
                    Pid ->
                        io:format("Watchdog process is still alive"),
                        %% A pid obtained from the registry is not linked to
                        %% this process; without the link no 'EXIT' message
                        %% would arrive when that watchdog terminates.
                        link(Pid),
                        Pid
                end,
            {up, Watchdog}
    end.