我需要解析谷歌自定义搜索引擎(Google Custom Search Engine)的结果。第一个难题是,这些结果完全由 JavaScript 生成。下面的页面会加载我要解析的结果,它是在一个 JS 弹出窗口中打开的。
<script>
/**
 * Callback handed to the Custom Search loader through window.__gcse.
 *
 * While the document is still loading it re-registers itself with
 * google.setOnLoadCallback and returns whatever that call returns.
 * Once the document is complete it renders a results-only element named
 * 'gsearch' into the element with id "results" and runs the query 'lectures'.
 */
function gcseCallback() {
    var pageReady = document.readyState === 'complete';
    if (!pageReady) {
        // Too early: ask to be called again instead of rendering now.
        return google.setOnLoadCallback(gcseCallback, true);
    }

    var cseElements = google.search.cse.element;
    var renderOptions = {
        gname: 'gsearch',
        div: 'results',
        tag: 'searchresults-only',
        attributes: { linkTarget: '' }
    };
    cseElements.render(renderOptions);

    // Look the freshly rendered element up by its gname and start the search.
    cseElements.getElement('gsearch').execute('lectures');
}
// Publish the loader configuration, then inject the Custom Search script
// asynchronously. The configuration is assigned before the script tag is
// inserted so it is already in place when cse.js starts running.
(function (cx) {
    // 'explicit' tag parsing plus gcseCallback as the callback; the rendering
    // itself is done by gcseCallback via element.render().
    window.__gcse = {
        parsetags: 'explicit',
        callback: gcseCallback
    };

    // Reuse https when the host page is https; anything else falls back to http.
    var scheme = (document.location.protocol == 'https:') ? 'https:' : 'http:';

    var loader = document.createElement('script');
    loader.type = 'text/javascript';
    loader.async = true;
    loader.src = scheme + '//www.google.com/cse/cse.js?cx=' + cx;

    // Insert ahead of the first <script> element found in the document.
    var firstScript = document.getElementsByTagName('script')[0];
    firstScript.parentNode.insertBefore(loader, firstScript);
}('xxxxxx:xxxxxxx'));
</script>
<div id="results"></div>
我已经尝试过以下工具,但都没有成功:Selenium、Jsoup、HtmlUnit。
它们始终无法加载出结果。我知道通常只要等待一段时间,JS 就会加载完成,但谷歌自定义搜索引擎并非如此:在上述任何一种工具中,div id = results 里的数据都不会被加载。CSS、JS 等资源的请求都能正常加载,唯独实际的搜索结果加载不出来。我需要用 Java 来完成这件事。有没有更好的方法?
是否有可能强制页面直接加载html而不加载任何javascript?如果这是在HTML中,当然,它会容易得多。也许有一种方法在javascript加载后转换为html?
Selenium示例
package raTesting;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
public class Testing {
public static void main(String[] args)
{
WebDriver driver = new HtmlUnitDriver(BrowserVersion.CHROME);
driver.get("https://www.google.com/cse/publicurl?q=breaking&cx=005766509181136893168:j_finnh-2pi");
System.out.println(driver.getPageSource());
}
在浏览器中打开该网址时,会显示所有需要抓取的结果。但通过上述代码获取到的页面源代码中,始终不包含任何结果。
答案 0 :(得分:1)
写给仍在寻找解决方案的人:请按自己的需要修改下面的代码。把您的处理逻辑写成一个函数,并在 check() 函数中调用它。check() 中的内容会反复执行,直到遍历完 links 数组(或达到上限)为止。
* 已知问题:* casperjs 的运行速度比 Google 的 JS 快,结果会出现重复的链接。我还没找到办法让 casperjs 先等待 Google 的 JS 弹出窗口关闭。
// Create the CasperJS instance used by the whole script (verbose logging on).
var casper = require("casper").create({
    verbose: true
});

// First positional CLI argument: the URL the crawl starts from.
// Declared with var so it is not leaked as an implicit global.
var url = casper.cli.get(0);

// The base links array
var links = [
    url
];

// If we don't set a limit, it could go on forever.
// The limit is the SECOND positional CLI argument (the first one is the URL).
// ~~ truncates to an integer, so a missing or non-numeric value becomes 0
// and the default of 10 applies.
var upTo = ~~casper.cli.get(1) || 10;
var currentLink = 0;
// Queue a step that collects the links of the current page and appends the
// ones not seen before to the global links array.
// (It could be done all in one step, but it is intentionally split.)
// Must be invoked with the casper instance as `this`; `link` is only used
// in the log message.
function addLinks(link) {
    this.then(function() {
        // evaluate() may come back null/undefined (e.g. when the page-side
        // function fails), so fall back to an empty list instead of crashing.
        var found = this.evaluate(searchLinks) || [];
        this.echo(found.length + " links found on " + link);

        // Keep only URLs that are neither already queued nor repeated within
        // this page, so the same link is never visited twice.
        var fresh = found.filter(function(href, index) {
            return links.indexOf(href) === -1 && found.indexOf(href) === index;
        });
        links = links.concat(fresh);
    });
}
// Collect the href attribute of every anchor element in the document whose
// href starts with 'http://' (case-insensitive). Other schemes, including
// 'https://', and anchors without an href are left out.
// addLinks hands this function to evaluate(), so it only relies on `document`.
function searchLinks() {
    var httpPrefix = /^http:\/\/.*/i;
    var anchors = document.querySelectorAll("a");
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        var href = anchors[i].getAttribute("href");
        if (httpPrefix.test(href)) {
            hrefs.push(href);
        }
    }
    return hrefs;
}
// Open `link` on the casper instance bound as `this` and, from the callback
// given to this.start, echo the title of the loaded page.
function start(link) {
    var printTitle = function() {
        var title = this.getTitle();
        this.echo('Page title: ' + title);
    };
    this.start(link, printTitle);
}
// Crawl driver, invoked with the casper instance as `this`.
// Stops (echo + exit) when there is no link at the current index or the
// index has reached upTo; otherwise processes the current link, advances the
// index and passes itself to this.run so it is called again.
function check() {
    var hasNext = links[currentLink] && currentLink < upTo;
    if (!hasNext) {
        this.echo("All done.");
        this.exit();
        return;
    }

    var target = links[currentLink];
    this.echo('--- Link ' + currentLink + ' ---');
    start.call(this, target);
    addLinks.call(this, target);
    currentLink += 1;
    this.run(check);
}
// Kick off the crawl: start a session, queue a step that logs a banner,
// then run with check as the callback passed to run().
(function(runner) {
    var session = runner.start();
    session.then(function() {
        this.echo("Starting");
    });
    runner.run(check);
}(casper));
src:http://code.ohloh.net/file?fid=VzTcq4GkQhozuKWkprFfBghgXy4&cid=ZDmcCGgIq6k&s=&fp=513476&mp&projSelected=true#L0