解析谷歌自定义搜索引擎结果的最佳方法

时间:2014-03-19 20:16:19

标签: java selenium jsoup htmlunit

我需要解析谷歌自定义搜索引擎(Google Custom Search Engine)的结果。我遇到的第一个问题是:结果全部由 JavaScript 生成。下面的页面会加载需要解析的结果,这些结果显示在一个 JS 弹出层中。

<script>
/**
 * Callback handed to the Custom Search loader (see window.__gcse).
 *
 * Once the document has finished loading, renders a results-only search
 * element named 'gsearch' into the element with id 'results' and runs the
 * query 'lectures' on it. If the document is not complete yet, the callback
 * re-registers itself through google.setOnLoadCallback and returns whatever
 * that call returns.
 */
function gcseCallback() {
  if (document.readyState === 'complete') {
    // Only touch google.search once the page is ready.
    var cseElements = google.search.cse.element;
    var renderOptions = {
      gname: 'gsearch',
      div: 'results',
      tag: 'searchresults-only',
      attributes: { linkTarget: '' }
    };
    cseElements.render(renderOptions);
    cseElements.getElement('gsearch').execute('lectures');
    return;
  }
  // Not ready yet: ask to be called again on load.
  return google.setOnLoadCallback(gcseCallback, true);
}
// Loader configuration: tags are parsed explicitly (nothing is rendered
// automatically) and gcseCallback is invoked by the loader.
window.__gcse = {
  parsetags: 'explicit',
  callback: gcseCallback
};

// Inject the Custom Search loader script asynchronously, ahead of the first
// <script> element already present in the document.
(function(doc) {
  var engineId = 'xxxxxx:xxxxxxx';
  // Match the page's scheme when it is https; otherwise fall back to http.
  var scheme = doc.location.protocol == 'https:' ? 'https:' : 'http:';

  var loader = doc.createElement('script');
  loader.type = 'text/javascript';
  loader.async = true;
  loader.src = scheme + '//www.google.com/cse/cse.js?cx=' + engineId;

  var firstScript = doc.getElementsByTagName('script')[0];
  firstScript.parentNode.insertBefore(loader, firstScript);
})(document);
</script>
<div id="results"></div>

我已经尝试过以下工具,但都没有成功:Selenium、Jsoup、HtmlUnit。

它们都始终加载不出结果。我知道通常只要等待一段时间,JS 就会加载完成,但谷歌自定义搜索引擎并非如此:用上述任何一种工具,id 为 results 的 div 中的数据都始终不会被加载。CSS、JS 等资源的请求都会被加载,但实际的搜索结果不会。我需要用 Java 来实现。有没有更好的方法?

是否有可能强制页面直接以 HTML 形式加载结果,而不加载任何 JavaScript?如果结果直接写在 HTML 里,解析当然会容易得多。或者,有没有办法在 JavaScript 执行完毕之后,把页面转换成 HTML?

Selenium示例

package raTesting;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;

public class Testing {

    public static void main(String[] args)
    {
        WebDriver driver = new HtmlUnitDriver(BrowserVersion.CHROME);

        driver.get("https://www.google.com/cse/publicurl?q=breaking&cx=005766509181136893168:j_finnh-2pi");

        System.out.println(driver.getPageSource());

          }

在浏览器中打开该网址时,会显示需要抓取的所有结果;但程序获取到的页面源代码中始终不包含任何结果。

1 个答案:

答案 0 :(得分:1)

写给仍在寻找解决办法的人:请按您的需求修改下面的代码。把您自己的处理逻辑放进一个函数,并在 check() 函数中调用它。check() 中的代码会反复执行,直到遍历完 links 数组或达到上限为止。

* 已知问题:* CasperJS 的运行速度比 Google 的 JS 快,结果会出现重复的链接。我还没能让 CasperJS 先等待 Google 的 JS 弹出层关闭。

var casper = require("casper").create({
    verbose: true
});

// First positional command-line argument: the URL the crawl starts from.
// Declared with `var` so it is not created as an implicit global.
var url = casper.cli.get(0);

// The base links array, seeded with the start URL
var links = [
    url
];

// Second positional command-line argument: maximum number of links to visit.
// If we don't set a limit, it could go on forever, so fall back to 10 when the
// argument is missing, non-numeric or zero (~~ truncates to an integer and
// turns non-numeric input into 0). Argument 0 is already taken by the URL.
var upTo = ~~casper.cli.get(1) || 10;

// Index into `links` of the next link to visit
var currentLink = 0;

// Get the links, and add them to the links array
// (It could be done all in one step, but it is intentionally split)
//
// Must be called with the casper instance as `this`
// (e.g. addLinks.call(casper, link)). Schedules a step that runs searchLinks
// through evaluate() and appends the hrefs it returns to the shared `links`
// array. `link` is only used in the log message.
function addLinks(link) {
    this.then(function() {
        // If evaluate() hands back nothing (e.g. null when the in-page
        // function fails), treat it as "no links" instead of crashing on
        // `.length`.
        var found = this.evaluate(searchLinks) || [];
        this.echo(found.length + " links found on " + link);
        links = links.concat(found);
    });
}

// Collects, in document order, the href attribute of every <a> element whose
// href starts with 'http://' (matched case-insensitively). Anchors without an
// href, or with any other scheme, are skipped. Reads the global `document`.
function searchLinks() {
    var anchors = document.querySelectorAll("a");
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        var href = anchors[i].getAttribute("href");
        if ((/^http:\/\/.*/i).test(href)) {
            hrefs.push(href);
        }
    }
    return hrefs;
}

// Opens `link` and, once it is loaded, prints the page title.
// Must be called with the casper instance as `this`.
function start(link) {
    var announceTitle = function() {
        var title = this.getTitle();
        this.echo('Page title: ' + title);
    };
    this.start(link, announceTitle);
}

// Drives the crawl; must be called with the casper instance as `this`.
// While there is a link at the current position and the limit `upTo` has not
// been reached, it visits that link, collects its links, advances the position
// and re-runs itself. Otherwise it reports completion and exits.
function check() {
    var hasNext = links[currentLink] && currentLink < upTo;
    if (!hasNext) {
        this.echo("All done.");
        this.exit();
        return;
    }

    this.echo('--- Link ' + currentLink + ' ---');
    var target = links[currentLink];
    start.call(this, target);
    addLinks.call(this, target);
    currentLink++;
    this.run(check);
}

// Queue an initial step that only logs a banner.
casper.start().then(function announceStart() {
    this.echo("Starting");
});

// Start executing; check is passed as the callback to run() and keeps the
// crawl going from there.
casper.run(check);

src:http://code.ohloh.net/file?fid=VzTcq4GkQhozuKWkprFfBghgXy4&cid=ZDmcCGgIq6k&s=&fp=513476&mp&projSelected=true#L0