我正在尝试阅读网页的html。我认为有些字段是由脚本填充的,因为我可以在浏览器中完全看到它们,但是抓取网页的内容是空的。 当我尝试使用HtmlUnit和下一个代码时,我在日志中得到一个大的异常,我不知道如何解决它。
这是我的代码:
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
public class HtmlUnitTest {
public static void main(String[] args) {
/* turn off annoying htmlunit warnings */
java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(java.util.logging.Level.OFF);
String searchQuery = "William Hill" ;
WebClient client = new WebClient(BrowserVersion.CHROME);
client.getOptions().setCssEnabled(false);
client.getOptions().setJavaScriptEnabled(true);
try {
String searchUrl = "http://sports.williamhill.es/bet_esp/es/betting/t/338/LaLiga.html" + URLEncoder.encode(searchQuery, "UTF-8");
HtmlPage page = client.getPage(searchUrl);
client.waitForBackgroundJavaScriptStartingBefore(10000);
final DomNodeList<DomNode> divs = page.querySelectorAll("tr.rowOdd");
for (DomNode div : divs) {
System.out.println(div.asXml());
}
}catch(Exception e){
e.printStackTrace();
}
}
}
以下是我得到的例外情况:
======= EXCEPTION START ========
EcmaError: lineNumber=[471] column=[0] lineSource=[<no source>] name=[TypeError] sourceName=[http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1] message=[TypeError: Cannot call method "replace" of undefined (http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1#471)]
com.gargoylesoftware.htmlunit.ScriptException: TypeError: Cannot call method "replace" of undefined (http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1#471)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:914)
at net.sourceforge.htmlunit.corejs.javascript.Context.call(Context.java:599)
at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.call(ContextFactory.java:527)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:794)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:770)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:761)
at com.gargoylesoftware.htmlunit.html.HtmlPage.executeJavaScript(HtmlPage.java:919)
at com.gargoylesoftware.htmlunit.html.HtmlScript.executeInlineScriptIfNeeded(HtmlScript.java:316)
at com.gargoylesoftware.htmlunit.html.HtmlScript.executeScriptIfNeeded(HtmlScript.java:396)
at com.gargoylesoftware.htmlunit.html.HtmlScript$2.execute(HtmlScript.java:246)
at com.gargoylesoftware.htmlunit.html.HtmlScript.onAllChildrenAddedToPage(HtmlScript.java:267)
at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.endElement(HTMLParser.java:805)
at org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source)
at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.endElement(HTMLParser.java:761)
at net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer.callEndElement(HTMLTagBalancer.java:1236)
at net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer.endElement(HTMLTagBalancer.java:1136)
at net.sourceforge.htmlunit.cyberneko.filters.DefaultFilter.endElement(DefaultFilter.java:226)
at net.sourceforge.htmlunit.cyberneko.filters.NamespaceBinder.endElement(NamespaceBinder.java:345)
at net.sourceforge.htmlunit.cyberneko.HTMLScanner$ContentScanner.scanEndElement(HTMLScanner.java:3189)
at net.sourceforge.htmlunit.cyberneko.HTMLScanner$ContentScanner.scan(HTMLScanner.java:2141)
at net.sourceforge.htmlunit.cyberneko.HTMLScanner.scanDocument(HTMLScanner.java:945)
at net.sourceforge.htmlunit.cyberneko.HTMLConfiguration.parse(HTMLConfiguration.java:521)
at net.sourceforge.htmlunit.cyberneko.HTMLConfiguration.parse(HTMLConfiguration.java:472)
at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.parse(HTMLParser.java:1004)
at com.gargoylesoftware.htmlunit.html.HTMLParser.parse(HTMLParser.java:253)
at com.gargoylesoftware.htmlunit.html.HTMLParser.parseHtml(HTMLParser.java:195)
at com.gargoylesoftware.htmlunit.DefaultPageCreator.createHtmlPage(DefaultPageCreator.java:267)
at com.gargoylesoftware.htmlunit.DefaultPageCreator.createPage(DefaultPageCreator.java:158)
at com.gargoylesoftware.htmlunit.WebClient.loadWebResponseInto(WebClient.java:529)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:398)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:315)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:463)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:448)
at HtmlUnitTest.main(HtmlUnitTest.java:25)
Caused by: net.sourceforge.htmlunit.corejs.javascript.EcmaError: TypeError: Cannot call method "replace" of undefined (http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1#471)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.constructError(ScriptRuntime.java:4130)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.constructError(ScriptRuntime.java:4108)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.typeError(ScriptRuntime.java:4141)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.typeError2(ScriptRuntime.java:4160)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.undefCallError(ScriptRuntime.java:4179)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.getPropFunctionAndThisHelper(ScriptRuntime.java:2509)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.getPropFunctionAndThis(ScriptRuntime.java:2502)
at net.sourceforge.htmlunit.corejs.javascript.Interpreter.interpretLoop(Interpreter.java:1327)
at net.sourceforge.htmlunit.corejs.javascript.Interpreter.interpret(Interpreter.java:815)
at net.sourceforge.htmlunit.corejs.javascript.InterpretedFunction.call(InterpretedFunction.java:111)
at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.doTopCall(ContextFactory.java:417)
at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory.doTopCall(HtmlUnitContextFactory.java:325)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.doTopCall(ScriptRuntime.java:3424)
at net.sourceforge.htmlunit.corejs.javascript.InterpretedFunction.exec(InterpretedFunction.java:122)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$3.doRun(JavaScriptEngine.java:785)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:899)
... 34 more
Enclosed exception:
net.sourceforge.htmlunit.corejs.javascript.EcmaError: TypeError: Cannot call method "replace" of undefined (http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1#471)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.constructError(ScriptRuntime.java:4130)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.constructError(ScriptRuntime.java:4108)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.typeError(ScriptRuntime.java:4141)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.typeError2(ScriptRuntime.java:4160)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.undefCallError(ScriptRuntime.java:4179)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.getPropFunctionAndThisHelper(ScriptRuntime.java:2509)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.getPropFunctionAndThis(ScriptRuntime.java:2502)
at net.sourceforge.htmlunit.corejs.javascript.Interpreter.interpretLoop(Interpreter.java:1327)
at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:471)
at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:377)
at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:379)
at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:397)
at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:415)
at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:418)
at script(script in http://sports.williamhill.es/bet_esp/es/betting/t/338/LaLiga.htmlWilliam+Hill from (1174, 34) to (1192, 12):1176)
at net.sourceforge.htmlunit.corejs.javascript.Interpreter.interpret(Interpreter.java:815)
at net.sourceforge.htmlunit.corejs.javascript.InterpretedFunction.call(InterpretedFunction.java:111)
at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.doTopCall(ContextFactory.java:417)
at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory.doTopCall(HtmlUnitContextFactory.java:325)
at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.doTopCall(ScriptRuntime.java:3424)
at net.sourceforge.htmlunit.corejs.javascript.InterpretedFunction.exec(InterpretedFunction.java:122)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$3.doRun(JavaScriptEngine.java:785)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:899)
at net.sourceforge.htmlunit.corejs.javascript.Context.call(Context.java:599)
at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.call(ContextFactory.java:527)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:794)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:770)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:761)
at com.gargoylesoftware.htmlunit.html.HtmlPage.executeJavaScript(HtmlPage.java:919)
at com.gargoylesoftware.htmlunit.html.HtmlScript.executeInlineScriptIfNeeded(HtmlScript.java:316)
at com.gargoylesoftware.htmlunit.html.HtmlScript.executeScriptIfNeeded(HtmlScript.java:396)
at com.gargoylesoftware.htmlunit.html.HtmlScript$2.execute(HtmlScript.java:246)
at com.gargoylesoftware.htmlunit.html.HtmlScript.onAllChildrenAddedToPage(HtmlScript.java:267)
at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.endElement(HTMLParser.java:805)
at org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source)
at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.endElement(HTMLParser.java:761)
at net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer.callEndElement(HTMLTagBalancer.java:1236)
at net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer.endElement(HTMLTagBalancer.java:1136)
at net.sourceforge.htmlunit.cyberneko.filters.DefaultFilter.endElement(DefaultFilter.java:226)
at net.sourceforge.htmlunit.cyberneko.filters.NamespaceBinder.endElement(NamespaceBinder.java:345)
at net.sourceforge.htmlunit.cyberneko.HTMLScanner$ContentScanner.scanEndElement(HTMLScanner.java:3189)
at net.sourceforge.htmlunit.cyberneko.HTMLScanner$ContentScanner.scan(HTMLScanner.java:2141)
at net.sourceforge.htmlunit.cyberneko.HTMLScanner.scanDocument(HTMLScanner.java:945)
at net.sourceforge.htmlunit.cyberneko.HTMLConfiguration.parse(HTMLConfiguration.java:521)
at net.sourceforge.htmlunit.cyberneko.HTMLConfiguration.parse(HTMLConfiguration.java:472)
at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.parse(HTMLParser.java:1004)
at com.gargoylesoftware.htmlunit.html.HTMLParser.parse(HTMLParser.java:253)
at com.gargoylesoftware.htmlunit.html.HTMLParser.parseHtml(HTMLParser.java:195)
at com.gargoylesoftware.htmlunit.DefaultPageCreator.createHtmlPage(DefaultPageCreator.java:267)
at com.gargoylesoftware.htmlunit.DefaultPageCreator.createPage(DefaultPageCreator.java:158)
at com.gargoylesoftware.htmlunit.WebClient.loadWebResponseInto(WebClient.java:529)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:398)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:315)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:463)
at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:448)
at HtmlUnitTest.main(HtmlUnitTest.java:25)
======= EXCEPTION END ========
答案 0 :(得分:0)
第一次,通过启用选项
告诉webclient不要为脚本错误抛出异常 client.getOptions().setThrowExceptionOnScriptError(false);