Crawling a GWT application takes too much time

Asked: 2013-10-31 20:02:12

Tags: java gwt seo web-crawler htmlunit

I have a GWT application whose SEO I need to optimize (so Google can crawl its content). I have tried several solutions, but none of them meets our needs because they take too much time to return the HTML page. The attempts were:

  1. I tried using HtmlUnit as a headless browser to render pages on demand. Getting the HTML content takes about 15 seconds, and profiling showed that 80% of that time is spent in the loop that waits for background JavaScript: "while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks)".
  2. Pre-rendering pages before Google requests them, then serving the saved snapshot when Google asks for it. This is clearly unsuitable for us, because the content changes very frequently and Google might treat the stale snapshots as "cloaking".

Any suggestions?

The code used for crawling:

    import java.io.IOException;
    import java.io.PrintWriter;
    import java.io.UnsupportedEncodingException;
    import java.net.URL;
    import java.net.URLDecoder;
    import java.util.logging.Level;
    import java.util.logging.Logger;

    import javax.servlet.Filter;
    import javax.servlet.FilterChain;
    import javax.servlet.FilterConfig;
    import javax.servlet.ServletException;
    import javax.servlet.ServletRequest;
    import javax.servlet.ServletResponse;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;

    import com.gargoylesoftware.htmlunit.BrowserVersion;
    import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
    import com.gargoylesoftware.htmlunit.SilentCssErrorHandler;
    import com.gargoylesoftware.htmlunit.WebClient;
    import com.gargoylesoftware.htmlunit.WebRequest;
    import com.gargoylesoftware.htmlunit.html.HtmlPage;

    public class CrawlFilter implements Filter {
        /** Forces every AJAX call to run synchronously so the snapshot is complete. */
        private class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
            private static final long serialVersionUID = 1L;

            @Override
            public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
                return true;
            }
        }

        private final Logger log = Logger.getLogger(CrawlFilter.class.getName());

        /**
         * Special URL token that gets passed from the crawler to the servlet
         * filter. This token is used in case there are already existing query
         * parameters.
         */
        private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
        private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();
        /**
         * Special URL token that gets passed from the crawler to the servlet
         * filter. This token is used in case there are no existing query
         * parameters.
         */
        private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
        private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();

        private static final long _pumpEventLoopTimeoutMillis = 30000;
        private static final long _jsTimeoutMillis = 1000;
        private static final long _pageWaitMillis = 200;
        private static final int _maxLoopChecks = 2;

        private WebClient webClient;

        @Override
        public void doFilter(ServletRequest request, ServletResponse response,
                             FilterChain filterChain) throws IOException, ServletException {
            // Grab the request URI and query string.
            final HttpServletRequest httpRequest = (HttpServletRequest) request;
            final String requestURI = httpRequest.getRequestURI();
            final String queryString = httpRequest.getQueryString();
            final HttpServletResponse httpResponse = (HttpServletResponse) response;

            if ((queryString != null) && (queryString.contains(ESCAPED_FRAGMENT_FORMAT1))) {
                // Crawler request: rebuild the original #! URL and render it locally.
                final int port = httpRequest.getServerPort();
                final String urlStringWithHashFragment = requestURI + rewriteQueryString(queryString);
                final String scheme = httpRequest.getScheme();
                final URL urlWithHashFragment = new URL(scheme, "127.0.0.1", port, urlStringWithHashFragment);
                final WebRequest webRequest = new WebRequest(urlWithHashFragment);

                log.fine("Crawl filter encountered escaped fragment, will open: " + webRequest.toString());

                httpResponse.setContentType("text/html;charset=UTF-8");
                final PrintWriter out = httpResponse.getWriter();
                out.println(renderPage(webRequest));
                out.flush();
                out.close();

                log.fine("HtmlUnit completed webClient.getPage(webRequest) where webRequest = " + webRequest.toString());
            } else {
                filterChain.doFilter(request, response);
            }
        }

        @Override
        public void destroy() {
            if (webClient != null) {
                webClient.closeAllWindows();
            }
        }

        @Override
        public void init(FilterConfig config) throws ServletException {
        }

        private StringBuilder renderPage(WebRequest webRequest) throws IOException {
            webClient = new WebClient(BrowserVersion.FIREFOX_17);
            webClient.getCache().clear();
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setRedirectEnabled(false);
            webClient.setAjaxController(new SyncAllAjaxController());
            webClient.setCssErrorHandler(new SilentCssErrorHandler());

            final HtmlPage page = webClient.getPage(webRequest);
            webClient.getJavaScriptEngine().pumpEventLoop(_pumpEventLoopTimeoutMillis);

            int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
            int loopCount = 0;

            // This wait loop is where ~80% of the 15 seconds is spent.
            while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks) {
                ++loopCount;
                waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);

                if (waitForBackgroundJavaScript == 0) {
                    log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
                    break;
                }

                synchronized (page) {
                    log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
                    try {
                        page.wait(_pageWaitMillis);
                    } catch (InterruptedException e) {
                        log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
                    }
                }
            }

            webClient.getAjaxController().processSynchron(page, webRequest, false);
            if (webClient.getJavaScriptEngine().isScriptRunning()) {
                log.warning("HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
                webClient.getJavaScriptEngine().shutdownJavaScriptExecutor();
            }

            final String staticSnapshotHtml = page.asXml();
            StringBuilder stringBuilder = new StringBuilder();
            stringBuilder.append("<hr />\n");
            stringBuilder.append("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
            stringBuilder.append(webRequest.getUrl() + "\">this link</a> for the interactive application.<br></h3></center>");
            stringBuilder.append("<hr />");
            stringBuilder.append(staticSnapshotHtml);

            return stringBuilder;
        }

        /**
         * Maps from the query string that contains _escaped_fragment_ to one that
         * doesn't, but is instead followed by a hash fragment. It also unescapes any
         * characters that were escaped by the crawler. If the query string does not
         * contain _escaped_fragment_, it is not modified.
         *
         * @param queryString
         * @return A modified query string followed by a hash fragment if applicable.
         *         The non-modified query string otherwise.
         * @throws UnsupportedEncodingException
         */
        private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
            int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
            int length = ESCAPED_FRAGMENT_LENGTH2;

            if (index == -1) {
                index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
                length = ESCAPED_FRAGMENT_LENGTH1;
            }

            if (index != -1) {
                StringBuilder queryStringSb = new StringBuilder();
                if (index > 0) {
                    queryStringSb.append("?");
                    queryStringSb.append(queryString.substring(0, index));
                }
                queryStringSb.append("#!");
                queryStringSb.append(URLDecoder.decode(queryString.substring(index + length), "UTF-8"));
                return queryStringSb.toString();
            }

            return queryString;
        }
    }
    

1 Answer:

Answer 0 (score: 0):

I suggest having HtmlUnit generate the static HTML offline, ahead of crawler requests. That takes the 15-second rendering cost off the request path, and you control how often the snapshots are refreshed.
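
For example, here is a minimal sketch of an offline snapshot generator. The SnapshotGenerator class, the hard-coded page list, the /var/snapshots directory, and the 10-minute period are all assumptions for illustration, not part of the original answer; it reuses the same HtmlUnit calls as the question's filter:

    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.util.Arrays;
    import java.util.List;
    import java.util.concurrent.Executors;
    import java.util.concurrent.ScheduledExecutorService;
    import java.util.concurrent.TimeUnit;

    import com.gargoylesoftware.htmlunit.BrowserVersion;
    import com.gargoylesoftware.htmlunit.WebClient;
    import com.gargoylesoftware.htmlunit.html.HtmlPage;

    public class SnapshotGenerator {
        // Hypothetical output directory and page list -- adjust to your application.
        private static final Path SNAPSHOT_DIR = Paths.get("/var/snapshots");
        private static final List<String> PAGES = Arrays.asList(
                "http://127.0.0.1:8080/#!home",
                "http://127.0.0.1:8080/#!products");

        public static void main(String[] args) {
            ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
            // Re-crawl every 10 minutes; tune the period to how often the content changes.
            scheduler.scheduleAtFixedRate(new Runnable() {
                @Override
                public void run() {
                    crawlAll();
                }
            }, 0, 10, TimeUnit.MINUTES);
        }

        private static void crawlAll() {
            WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            try {
                for (String url : PAGES) {
                    HtmlPage page = webClient.getPage(url);
                    webClient.waitForBackgroundJavaScript(1000);
                    // Name the snapshot file after the hash fragment, e.g. "home.html".
                    String name = url.substring(url.indexOf("#!") + 2) + ".html";
                    Files.write(SNAPSHOT_DIR.resolve(name), page.asXml().getBytes("UTF-8"));
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                webClient.closeAllWindows();
            }
        }
    }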

Then have your servlet filter intercept crawler requests and return the pre-generated static HTML instead of rendering on demand.
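
And a minimal sketch of that serving side, reusing the question's _escaped_fragment_ detection but reading the pre-generated file instead of invoking HtmlUnit. The SnapshotFilter name and the file-naming scheme are assumptions and must match the generator above:

    import java.io.IOException;
    import java.net.URLDecoder;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;

    import javax.servlet.Filter;
    import javax.servlet.FilterChain;
    import javax.servlet.FilterConfig;
    import javax.servlet.ServletException;
    import javax.servlet.ServletRequest;
    import javax.servlet.ServletResponse;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;

    public class SnapshotFilter implements Filter {
        private static final String FRAGMENT_PARAM = "_escaped_fragment_=";
        // Must be the same directory the offline generator writes to (assumption).
        private static final Path SNAPSHOT_DIR = Paths.get("/var/snapshots");

        @Override
        public void doFilter(ServletRequest request, ServletResponse response,
                             FilterChain chain) throws IOException, ServletException {
            HttpServletRequest httpRequest = (HttpServletRequest) request;
            String queryString = httpRequest.getQueryString();

            if (queryString != null && queryString.contains(FRAGMENT_PARAM)) {
                // Map the escaped fragment back to a snapshot file name.
                // Real code should sanitize this to prevent path traversal.
                String fragment = URLDecoder.decode(queryString.substring(
                        queryString.indexOf(FRAGMENT_PARAM) + FRAGMENT_PARAM.length()), "UTF-8");
                Path snapshot = SNAPSHOT_DIR.resolve(fragment + ".html");

                if (Files.exists(snapshot)) {
                    HttpServletResponse httpResponse = (HttpServletResponse) response;
                    httpResponse.setContentType("text/html;charset=UTF-8");
                    Files.copy(snapshot, httpResponse.getOutputStream());
                    return; // Served from disk: no HtmlUnit work on the request path.
                }
                // No snapshot yet: fall through and serve the application as usual.
            }
            chain.doFilter(request, response);
        }

        @Override
        public void init(FilterConfig config) throws ServletException {
        }

        @Override
        public void destroy() {
        }
    }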