使用 mochiweb 解析器的算法无法计算出图像大小

时间:2015-01-17 00:30:46

标签: erlang mochiweb

我正在尝试让这篇文章 https://ppolv.wordpress.com/2008/05/09/fun-with-mochiwebs-html-parser-and-xpath/ 中的算法运行起来。代码可以编译,并且大体上能正常工作(我得到了 HTML 页面的大小),但是:

我的期望:

html的大小

图像大小

脚本大小

我得到了什么:

html的大小

无论如何,图像的大小都等于零

无论如何,脚本的大小都等于零

我花了好几个小时寻找错误或者我遗漏的地方,但仍然不知道问题出在哪里。代码:

-module(test).
-author("Hubert").

%% API
-export([printing/4]).
-export([page_info/1]).
-export([got_page_info/3]).
-export([content_length/1]).
-export([spawn_workers/3]).
-export([get_info/2]).
-export([get_url_context/1]).
-export([wait_for_responses/2]).

%% Accumulator record for one page analysis: `page` holds the HTML page size,
%% `timer` the reference of the timeout timer, `errors` a list of
%% {URL, Reason} failures, and `img`, `css` and `script` the summed
%% content-length values reported for each component type.
-record(state, {page,timer,errors,img,css,script}).

%% Fetch URL over HTTP and hand the response to got_page_info/3.
%% Returns the result of got_page_info/3, or {error, Reason} when the
%% request itself fails.
page_info(URL) ->
  inets:start(),
  Response = httpc:request(URL),
  case Response of
    {error,_} = Failure ->
      Failure;
    {ok,{_Status,Headers,Body}} ->
      got_page_info(URL,content_length(Headers),Body)
  end.

%% Parse the fetched HTML, start one worker per distinct image and script
%% URL, then wait for the workers' size reports.
%% URLpassed is the page URL, PageSize its content length and Body the HTML.
%% Returns {ok}.
got_page_info(URLpassed, PageSize,Body) ->
  Tree = mochiweb_html:parse(Body),

  %% Distinct src attributes of the <img> and <script> elements.
  Imgs = rDup(mochiweb_xpath:execute("//img/@src",Tree)),
  %% Stylesheets are disabled because the query below did not work.
  %% NOTE(review): the query uses typographic quotes (’) rather than plain
  %% single quotes; presumably that is why it fails. Confirm before enabling.
  %Css = rDup(mochiweb_xpath:execute("//link[@rel=’stylesheet’]/@href",Tree)),
  Scripts = rDup(mochiweb_xpath:execute("//script/@src",Tree)),

  %% The xpath results are converted from binaries to strings for the workers.
  UrlCtx = get_url_context(URLpassed),
  spawn_workers(UrlCtx,img,[binary_to_list(Src) || Src <- Imgs]),
  spawn_workers(UrlCtx,script,[binary_to_list(Src) || Src <- Scripts]),

  %% The timer sends `timeout` to this process after 10000 milliseconds.
  Timer = erlang:send_after(10000,self(),timeout),

  %% One response is expected for every image and every script.
  Expected = length(Imgs) + length(Scripts),
  Initial = #state{page=PageSize,timer=Timer,errors=[],img=0,css=0,script=0},
  wait_for_responses(Initial,Expected),
  {ok}.

%% Return the value of the "content-length" header as an integer.
%% Headers is a list of {Name, Value} string pairs; a missing header counts
%% as 0.
content_length(Headers) ->
  Raw = proplists:get_value("content-length",Headers,"0"),
  list_to_integer(Raw).

%% Remove duplicate elements from list L by passing it through a set.
rDup(L) ->
  Unique = sets:from_list(L),
  sets:to_list(Unique).

%% Spawn one worker process per URL in URLs. Each worker calls
%% get_info(URLctx, Url) and sends {component, Type, Url, Result} to self().
%% NOTE(review): self() is evaluated inside the spawned fun, so it refers to
%% the worker itself rather than to the process that called spawn_workers/3.
spawn_workers(URLctx,Type,URLs) ->
  lists:foreach(fun (Url) -> spawn( fun () ->
                                    self() ! {component, Type,Url,get_info(URLctx,Url)}
                                    end)
              end, URLs).

%% Split an http URL into the context used to resolve component URLs.
%% Returns {Root, Ctx}: Root is "http://Host", followed by ":Port" when the
%% port is not the default 80, and Ctx is the path up to and including its
%% last "/".
%% Crashes with badmatch when URL cannot be parsed or is not an http URL.
%% NOTE(review): http_uri is deprecated in recent OTP releases in favour of
%% uri_string; confirm availability on the target OTP version.
get_url_context(URL) ->
  {ok,{http,_UserInfo,Host,Port,Path,_Query}} = http_uri:parse(URL),
  Ctx = string:sub_string(Path,1, string:rstr(Path,"/")),
  {"http://" ++ Host ++ port_suffix(Port),Ctx}.

%% Textual port part of a root URL; empty for the default http port so that
%% ordinary URLs keep their previous Root.
port_suffix(80) -> "";
port_suffix(Port) -> ":" ++ integer_to_list(Port).

%% Issue a HEAD request for the component Url, resolved against URlctx.
%% Returns {ok, ContentLength} on a response, {error, Reason} otherwise.
get_info(URlctx,Url) ->
  Target = full_url(URlctx,Url),
  Reply = httpc:request(head,{Target,[]},[],[]),
  case Reply of
    {error,_} = Failure ->
      Failure;
    {ok,{_Status,Headers,_Body}} ->
      {ok,content_length(Headers)}
  end.


%% Resolve a component URL against the page context {Root, Context}.
%% Root is the scheme and host of the page, Context the directory part of
%% the page path. Returns the URL to request as a string.

%% Protocol-relative URL, e.g. //cdn.example.com/lib.js. This clause must
%% come before the server-absolute one because it also starts with "/".
%% The http scheme is used because get_url_context/1 only builds http roots.
full_url({_Root,_Context},ComponentUrl="//"++_) ->
  "http:" ++ ComponentUrl;
%% Absolute path on the same server, e.g. /img/image.png
full_url({Root,_Context},ComponentUrl=[$/|_]) ->
  Root ++ ComponentUrl;
%% Full URL, e.g. http://other.com/img.png
full_url({_Root,_Context},ComponentUrl="http://"++_) ->
  ComponentUrl;
%% Full https URL; previously this was mistaken for a relative path.
full_url({_Root,_Context},ComponentUrl="https://"++_) ->
  ComponentUrl;
%% Everything else is treated as a path relative to Context. Paths such as
%% ../img are not normalised.
full_url({Root,Context},ComponentUrl) ->
  Root ++ join_path(Context,ComponentUrl).

%% Join Context and a relative path with exactly one "/" between them.
join_path(Context,ComponentUrl) ->
  case lists:suffix("/",Context) of
    true -> Context ++ ComponentUrl;
    false -> Context ++ "/" ++ ComponentUrl
  end.

%% Fold one worker result into State: an {ok, Size} result is added to the
%% counter of its component type, an {error, Reason} result is recorded in
%% the errors list together with its URL.
collect_info(#state{} = State,css,_Url,{ok,Size}) ->
  State#state{css = State#state.css + Size};
collect_info(#state{} = State,img,_Url,{ok,Size}) ->
  State#state{img = State#state.img + Size};
collect_info(#state{} = State,script,_Url,{ok,Size}) ->
  State#state{script = State#state.script + Size};
collect_info(#state{} = State,_Type,Url,{error,Reason}) ->
  State#state{errors = [{Url,Reason} | State#state.errors]}.

%% Receive worker messages until Counter responses have arrived or the
%% `timeout` message is received, then hand the accumulated State to
%% finalize/2 together with the number of responses still outstanding.
wait_for_responses(State,0) ->
  finalize(State,0);
wait_for_responses(State,Pending) ->
  receive
    timeout ->
      finalize(State,Pending);
    {component,Kind,Url,Result} ->
      Updated = collect_info(State,Kind,Url,Result),
      wait_for_responses(Updated,Pending - 1)
  end.

%% Cancel the timeout timer stored in State and print the collected sizes.
%% The second argument (the number of responses still outstanding) is not
%% used at present. Returns the result of printing/4.
finalize(State,_Left) ->
  #state{page=PageSize,img=ImgSize,css=CssSize,script=ScriptSize,timer=TRef} = State,
  erlang:cancel_timer(TRef),
  printing(PageSize,ImgSize,CssSize,ScriptSize).

%% Print the html, image and script sizes (given in bytes) in kilobytes.
%% The stylesheet size is accepted but not printed. Returns {ok}.
printing(PageSize,ImgSize,_CssSize,ScriptSize)->
  Report = [{"html size: ~.2fkb~n",PageSize},
            {"images size: ~.2fkb~n",ImgSize},
            {"script size: ~.2fkb~n",ScriptSize}],
  lists:foreach(fun ({Format,Bytes}) -> io:format(Format,[Bytes/1024]) end, Report),
  {ok}.

2 个答案:

答案 0 :(得分:1)

问题出在这个函数上:

%% self() below is evaluated inside the spawned worker, so each worker sends
%% the result to itself instead of to the process that started it.
spawn_workers(URLctx,Type,URLs) ->
  lists:foreach(fun (Url) -> spawn( fun () ->
                                    self() ! {component, Type,Url,get_info(URLctx,Url)}
                                    end)
              end, URLs).

self() 是在新生成的进程中求值的,因此该进程把响应发给了它自己。应当在生成进程之前先把 self() 的值绑定到一个变量:

%% Spawn one worker per URL. The caller's pid is captured before spawning so
%% that every worker reports {component, Type, Url, Result} back to it.
spawn_workers(URLctx,Type,URLs) ->
  Collector = self(),
  StartWorker =
    fun (Url) ->
      spawn(fun () -> Collector ! {component, Type,Url,get_info(URLctx,Url)} end)
    end,
  lists:foreach(StartWorker, URLs).

答案 1 :(得分:1)

我不直接指出错误在哪里,而是向你展示如何用 dbg(Erlang 自带的跟踪调试工具)来调试它。用下面这些命令启动:

dbg:tracer(). % start the tracer process
dbg:p(all, c). % trace calls in all processes, for the patterns given later
ShowReturnedResults = [{'_', [], [{return_trace}]}]. % match spec that also shows return values (see the dbg docs)
dbg:tpl(test, get_info, '_', ShowReturnedResults).
test:page_info("http://www.lambdadays.org").

这会告诉你:每张图片都调用了 get_info/2,并且都返回了结果。所以问题一定出在收集结果的环节,让我们检查 wait_for_responses/2:

dbg:stop_clear(). % clears all traces
dbg:tracer().
dbg:p(all, c).
ShowReturnedResults = [{'_', [], [{return_trace}]}]. % match spec that also shows return values (see the dbg docs)
dbg:tpl(test, wait_for_responses, '_', ShowReturnedResults).
test:page_info("http://www.lambdadays.org").

糟糕,它只被调用了一次。这意味着它走到了超时分支。让我们看看这次调用期间发送了哪些消息。因为 io:format 会发送大量消息,所以我们在另一个进程中运行该函数。

Pid = spawn(fun() -> test:page_info("http://www.lambdadays.org") end),
dbg:p(Pid, [sos, m]). % print all messages sent and received by this process and by the processes it spawns

你应该收到很多消息,但我们只对返回的元组感兴趣:{component,img...},所以你可以找到这样的东西:

(<0.200.0>) <0.200.0> ! {component,img,
                               "/static/upload/media/1407924850920422agh.png",
                               {ok,189930}}
(<0.200.0>) << {component,img,"/static/upload/media/1407924850920422agh.png",
                      {ok,189930}}
(<0.199.0>) <0.199.0> ! {component,img,
                               "/static/upload/media/1407659467205755logo_glowna.png",
                               {ok,6424}}
(<0.199.0>) << {component,img,
                      "/static/upload/media/1407659467205755logo_glowna.png",
                      {ok,6424}}

您可以将其读作:

  • 进程 0.200.0 把结果发送给了 0.200.0(也就是它自己)
  • 进程 0.200.0 在自己的邮箱中收到了这个结果
  • 进程 0.199.0 同样向自己发送了消息并收到了该消息。

但他们为什么要把它发给自己呢?

%% Spawn a worker for every URL; each worker is meant to send the component
%% info obtained from get_info/2 back to the caller.
%% self() below is evaluated inside the spawned worker, so the message goes
%% to the worker itself instead of to the parent.
spawn_workers(URLctx,Type,URLs) ->
  lists:foreach(fun (Url) -> spawn( fun () ->
                                self() ! {component, Type,Url,get_info(URLctx,Url)}
                                end)
              end, URLs).

内层的 fun 是在新生成的进程的上下文中求值的,因此该进程把消息发给了自己,而不是父进程。你必须在父进程的上下文中对 self() 求值,并把结果绑定到一个变量,再在 fun 中使用该变量。

%% Spawn a worker for every URL. self() is evaluated here, in the parent,
%% so each worker sends {component, Type, Url, Result} back to the parent.
spawn_workers(URLctx,Type,URLs) ->
  ReplyTo = self(),
  Report = fun (Url) -> ReplyTo ! {component, Type,Url,get_info(URLctx,Url)} end,
  lists:foreach(fun (Url) -> spawn(fun () -> Report(Url) end) end, URLs).

dbg documentation