我有一个gwt应用程序,我需要优化seo(抓取谷歌的内容),我一直在尝试许多不符合我们需求的解决方案(这需要我们花费大量时间来返回html页面),试验是:
有什么建议吗?
用于抓取的代码:
public class CrawlFilter implements Filter {
private class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
private static final long serialVersionUID = 1L;
@Override
public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
return true;
}
}
private final Logger log = Logger.getLogger(CrawlFilter.class.getName());
/**
* Special URL token that gets passed from the crawler to the servlet
* filter. This token is used in case there are already existing query
* parameters.
*/
private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();
/**
* Special URL token that gets passed from the crawler to the servlet
* filter. This token is used in case there are not already existing query
* parameters.
*/
private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();
private static final long _pumpEventLoopTimeoutMillis = 30000;
private static final long _jsTimeoutMillis = 1000;
private static final long _pageWaitMillis = 200;
private static final int _maxLoopChecks = 2;
private WebClient webClient;
public void doFilter(ServletRequest request, ServletResponse response,
FilterChain filterChain) throws IOException, ServletException {
// Grab the request uri and query strings.
final HttpServletRequest httpRequest = (HttpServletRequest) request;
final String requestURI = httpRequest.getRequestURI();
final String queryString = httpRequest.getQueryString();
final HttpServletResponse httpResponse = (HttpServletResponse) response;
if ((queryString != null) && (queryString.contains(ESCAPED_FRAGMENT_FORMAT1))) {
final int port = httpRequest.getServerPort();
final String urlStringWithHashFragment = requestURI + rewriteQueryString(queryString);
final String scheme = httpRequest.getScheme();
final URL urlWithHashFragment = new URL(scheme, "127.0.0.1", port, urlStringWithHashFragment);
final WebRequest webRequest = new WebRequest(urlWithHashFragment);
log.fine("Crawl filter encountered escaped fragment, will open: " + webRequest.toString());
httpResponse.setContentType("text/html;charset=UTF-8");
final PrintWriter out = httpResponse.getWriter();
out.println(renderPage(webRequest));
out.flush();
out.close();
log.fine("HtmlUnit completed webClient.getPage(webRequest) where webRequest = " + webRequest.toString());
} else {
filterChain.doFilter(request, response);
}
}
@Override
public void destroy() {
if (webClient != null) {
webClient.closeAllWindows();
}
}
@Override
public void init(FilterConfig config) throws ServletException {
}
private StringBuilder renderPage(WebRequest webRequest) throws IOException {
webClient = new WebClient(BrowserVersion.FIREFOX_17);
webClient.getCache().clear();
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setRedirectEnabled(false);
webClient.setAjaxController(new SyncAllAjaxController());
webClient.setCssErrorHandler(new SilentCssErrorHandler());
final HtmlPage page = webClient.getPage(webRequest);
webClient.getJavaScriptEngine().pumpEventLoop(_pumpEventLoopTimeoutMillis);
int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
int loopCount = 0;
while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks) {
++loopCount;
waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
if (waitForBackgroundJavaScript == 0) {
log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
break;
}
synchronized (page) {
log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
try {
page.wait(_pageWaitMillis);
} catch (InterruptedException e) {
log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
}
}
}
webClient.getAjaxController().processSynchron(page, webRequest, false);
if (webClient.getJavaScriptEngine().isScriptRunning()) {
log.warning("HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
webClient.getJavaScriptEngine().shutdownJavaScriptExecutor();
}
final String staticSnapshotHtml = page.asXml();
StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("<hr />\n");
stringBuilder.append("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
stringBuilder.append(webRequest.getUrl() + "\">this link</a> for the interactive application.<br></h3></center>");
stringBuilder.append("<hr />");
stringBuilder.append(staticSnapshotHtml);
return stringBuilder;
}
/**
* Maps from the query string that contains _escaped_fragment_ to one that
* doesn't, but is instead followed by a hash fragment. It also unescapes any
* characters that were escaped by the crawler. If the query string does not
* contain _escaped_fragment_, it is not modified.
*
* @param queryString
* @return A modified query string followed by a hash fragment if applicable.
* The non-modified query string otherwise.
* @throws UnsupportedEncodingException
*/
private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
int length = ESCAPED_FRAGMENT_LENGTH2;
if (index == -1) {
index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
length = ESCAPED_FRAGMENT_LENGTH1;
}
if (index != -1) {
StringBuilder queryStringSb = new StringBuilder();
if (index > 0) {
queryStringSb.append("?");
queryStringSb.append(queryString.substring(0, index));
}
queryStringSb.append("#!");
queryStringSb.append(URLDecoder.decode(queryString.substring(index
+ length, queryString.length()), "UTF-8"));
return queryStringSb.toString();
}
return queryString;
}
}
答案 0 :(得分:0)
我建议让HtmlUnit离线生成静态html。您可以控制更新频率。
然后,让您的servlet过滤器拦截爬虫请求返回已生成的静态html。