我正在努力使这个https://ppolv.wordpress.com/2008/05/09/fun-with-mochiwebs-html-parser-and-xpath/算法工作。所以一切都在编译并且工作得很好(我得到了html页面的大小)但是:
我的期望:
html的大小
图像大小
脚本大小
我得到了什么:
html的大小
无论什么
,图像的大小等于零无论什么
,脚本的大小都等于零我试图找到错误或者我错过了好几个小时但我不知道出了什么问题。 代码:
-module(test).
-author("Hubert").
%% API
-export([printing/4]).
-export([page_info/1]).
-export([got_page_info/3]).
-export([content_length/1]).
-export([spawn_workers/3]).
-export([get_info/2]).
-export([get_url_context/1]).
-export([wait_for_responses/2]).
%declaring record that will hold number of images, css and scripts
-record(state, {page,timer,errors,img,css,script}).
page_info(URL) ->
inets:start(),
case httpc:request(URL) of
{ok,{_,Headers,Body}} ->
got_page_info(URL,content_length(Headers),Body);
{error,Reason} ->
{error,Reason}
end.
got_page_info(URLpassed, PageSize,Body) ->
%getting the parsed version of website
Tree = mochiweb_html:parse(Body),
%particular files being listed and removing duplicates
Imgs = rDup(mochiweb_xpath:execute("//img/@src",Tree)),
%css does not work, do not know why
%Css = rDup(mochiweb_xpath:execute("//link[@rel=’stylesheet’]/@href",Tree)),
Scripts = rDup(mochiweb_xpath:execute("//script/@src",Tree)),
%preapring URL
URL = get_url_context(URLpassed),
spawn_workers(URL,img,lists:map(fun binary_to_list/1,Imgs)),
spawn_workers(URL,script,lists:map(fun binary_to_list/1,Scripts)),
%Starts a timer which will send the message Msg to Dest after Time milliseconds.
TRef = erlang:send_after(10000,self(),timeout),
State = #state{page=PageSize,
timer=TRef,
errors=[],
img=0,
css=0,
script=0},
%number of elements -> so number of responses we should wait for
wait_for_responses(State,length(Imgs) + length(Scripts)),
{ok}.
content_length(Headers) ->
%proplists:get_value(Key,List,Default)
%returns the length of the content
list_to_integer(proplists:get_value("content-length",Headers,"0")).
%function that removes dulpicate
rDup(L) ->
sets:to_list(sets:from_list(L)).
%spawn workers for every URl, who send back info about components -> getinfo
spawn_workers(URLctx,Type,URLs) ->
lists:foreach(fun (Url) -> spawn( fun () ->
self() ! {component, Type,Url,get_info(URLctx,Url)}
end)
end, URLs).
get_url_context(URL) ->
{ok,{http,_,Root,_Port,Path,_Query}} = http_uri:parse(URL),
Ctx = string:sub_string(Path,1, string:rstr(Path,"/")),
{"http://"++Root,Ctx}. %% gib my url with context
get_info(URlctx,Url) ->
FullURL = full_url(URlctx,Url),
case httpc:request(head,{FullURL,[]},[],[]) of
{ok, {_,Headers,_Body}} ->
{ok,content_length(Headers)};
{error,Reason} ->
{error,Reason}
end.
%FULL URL FUNCTIONS
%% abs url inside the same server ej: /img/image.png
full_url({Root,_Context},ComponentUrl=[$/|_]) ->
Root ++ ComponentUrl;
%% full url ej: http://other.com/img.png
full_url({_Root,_Context},ComponentUrl="http://"++_) ->
ComponentUrl;
% everything else is considerer a relative path.. obviously its wrong (../img)
full_url({Root,Context},ComponentUrl) ->
Root ++ Context ++ "/" ++ ComponentUrl.
%collect infos recieved from wait_for_resposnses and add them to proper field of State
collect_info(State = #state{css=Css},css,_URL,{ok,Info}) ->
State#state{css = Css + Info};
collect_info(State = #state{img=Img},img,_URL,{ok,Info}) ->
State#state{img = Img + Info};
collect_info(State = #state{script=Script},script,_URL,{ok,Info}) ->
State#state{script = Script + Info};
collect_info(State = #state{errors=Errors},_Type,URL,{error,Reason}) ->
State#state{errors=[{URL,Reason}|Errors]}.
%messages from workers
wait_for_responses(State,0) ->
finalize(State,0);
wait_for_responses(State,Counter) ->
receive
{component,Type,URL,Info} ->
wait_for_responses(collect_info(State,Type,URL,Info),Counter - 1);
timeout -> finalize(State,Counter)
end.
%prepares variables for printing
finalize(State,Left) ->
PageSize = State#state.page,
ImgSize = State#state.img,
CssSize = State#state.css, %maybe one day will work
ScriptSize = State#state.script,
Errors = State#state.errors,
TRef = State#state.timer,
erlang:cancel_timer(TRef),
printing(PageSize,ImgSize,CssSize,ScriptSize).
printing(PageSize,ImgSize,CssSize,ScriptSize)->
io:format("html size: ~.2fkb~n",[PageSize/1024]),
io:format("images size: ~.2fkb~n",[ImgSize/1024]),
io:format("script size: ~.2fkb~n",[ScriptSize/1024]),
% io:format("stylesheet size: ~.2fkb~n",[CssSize/1024]),
{ok}.
答案 0 :(得分:1)
问题在于功能:
spawn_workers(URLctx,Type,URLs) ->
lists:foreach(fun (Url) -> spawn( fun () ->
self() ! {component, Type,Url,get_info(URLctx,Url)}
end)
end, URLs).
self()在生成的进程中进行评估,因此它将响应发送给自己。在生成进程之前将self分配给变量:
spawn_workers(URLctx,Type,URLs) ->
Pid = self(),
lists:foreach(fun (Url) -> spawn( fun () ->
Pid ! {component, Type,Url,get_info(URLctx,Url)}
end)
end, URLs).
答案 1 :(得分:1)
而不是显示错误的位置,我将向您展示如何使用dbg
调试它,这是Erlang调试器。用这些命令启动它:
dbg:tracer(). #start the process
dbg:p(all, c). #match all calls in patterns given later
ShowReturnedResults = [{'_', [], [{return_trace}]}] #find this magic in the docs
dbg:tpl(test, get_info, '_', ShowReturnedResults).
test:page_info("http://www.lambdadays.org").
这将告诉您,每个图片都调用get_info/2
并返回一些结果。
所以问题必须是收集结果,让我们检查wait_for_responses/2
:
dbg:stop_clear(). #clears all traces
dbg:tracer().
dbg:p(all, c).
ShowReturnedResults = [{'_', [], [{return_trace}]}] #find this magic in the docs
dbg:tpl(test, wait_for_responses, '_', ShowReturnedResults).
test:page_info("http://www.lambdadays.org").
糟糕。它只被调用一次。这意味着,它达到了超时。让我们看看在此调用期间发送的消息。因为io:format
发送了大量消息,所以我们可以在另一个进程中生成该函数。
Pid = spawn(fun() -> test:page_info("http://www.lambdadays.org") end),
dbg:p(Pid, [sos, m]). #print all messages, sent and received by this process and processes, that it spawned.
你应该收到很多消息,但我们只对返回的元组感兴趣:{component,img...}
,所以你可以找到这样的东西:
(<0.200.0>) <0.200.0> ! {component,img,
"/static/upload/media/1407924850920422agh.png",
{ok,189930}}
(<0.200.0>) << {component,img,"/static/upload/media/1407924850920422agh.png",
{ok,189930}}
(<0.199.0>) <0.199.0> ! {component,img,
"/static/upload/media/1407659467205755logo_glowna.png",
{ok,6424}}
(<0.199.0>) << {component,img,
"/static/upload/media/1407659467205755logo_glowna.png",
{ok,6424}}
您可以将其读作:
但他们为什么要把它发给自己呢?
%spawn workers for every URl, who send back info about components -> getinfo
spawn_workers(URLctx,Type,URLs) ->
lists:foreach(fun (Url) -> spawn( fun () ->
self() ! {component, Type,Url,get_info(URLctx,Url)}
end)
end, URLs).
内部fun
在新生成的进程的上下文中进行评估,因此进程将消息发送给自己,而不是父进程。您必须在父级上下文中评估self()
并将其传递给变量。
%spawn workers for every URl, who send back info about components -> getinfo
spawn_workers(URLctx,Type,URLs) ->
Parent = self(),
lists:foreach(fun (Url) -> spawn( fun () ->
Parent ! {component, Type,Url,get_info(URLctx,Url)}
end)
end, URLs).