这是我使用无头浏览器幻像js进行网页抓取的代码
DesiredCapabilities capabilities = new DesiredCapabilities();
capabilities.setJavascriptEnabled(true);
File file = new File("src/main/resources/phantomjs.exe");
String absolutePath = file.getAbsolutePath();
// capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
// "C:/Users/admin/Desktop/phantomjs-2.1.1-windows/bin/phantomjs.exe");
capabilities.setCapability(
PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
absolutePath);
WebDriver driver = new PhantomJSDriver(capabilities);
driver.get("http://blah.blah.com/players/?ls=iref:nba:gnav");
哪个工作正常,后来我从那里带了一些链接并进行下一轮抓取
if (listOfPyalerLinks.size() > 0) {
for (String url : listOfPyalerLinks) {
try {
Player p = new Player();
driver.get(url);
WebElement position = driver.findElement(By
.xpath("//*[contains(@class, 'num-position')]"));
String positionvalstr = position.getText();
String positionval = positionvalstr.split("\\|")[1];
p.setPosition(positionval);
WebElement tabstats = driver.findElement(By
.xpath("//*[contains(@id, 'tab-career')]"));
循环继续运行并抓取数据。但经过一段时间15或20分钟我收到错误
org.openqa.selenium.remote.UnreachableBrowserException: Error communicating with the remote browser. It may have died.
Build info: version: '2.39.0', revision: '14fa800511cc5d66d426e08b0b2ab926c7ed7398', time: '2013-12-16 13:18:38'
System info: host: 'Sparity', ip: '172.28.220.143', os.name: 'Windows 8', os.arch: 'amd64', os.version: '6.2', java.version: '1.7.0_51'
Driver info: driver.version: RemoteWebDriver
at org.openqa.selenium.remote.RemoteWebDriver.execute(RemoteWebDriver.java:548)
at org.openqa.selenium.remote.RemoteWebDriver.get(RemoteWebDriver.java:276)
at com.nbaparser.scrapper.service.impl.CrawlServiceImpl.parseDataFromLinks(CrawlServiceImpl.java:101)
at com.nbaparser.scrapper.service.impl.CrawlServiceImpl.getNBAPlayerslinksList(CrawlServiceImpl.java:82)
at com.nbaparser.test.CrawlTestCase.seleniumWithPanthamJSDriverTestCase(CrawlTestCase.java:70)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:45)
at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:15)
at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:42)
at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:20)
at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:263)
at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:68)
at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:47)
at org.junit.runners.ParentRunner$3.run(ParentRunner.java:231)
at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:60)
at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:229)
at org.junit.runners.ParentRunner.access$000(ParentRunner.java:50)
at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:222)
at org.junit.runners.ParentRunner.run(ParentRunner.java:300)
at org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:50)
at org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:459)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:675)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:382)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:192)
Caused by: org.openqa.selenium.WebDriverException: java.net.ConnectException: Connection refused: connect
Build info: version: '2.39.0', revision: '14fa800511cc5d66d426e08b0b2ab926c7ed7398', time: '2013-12-16 13:18:38'
System info: host: 'Sparity', ip: '172.28.220.143', os.name: 'Windows 8', os.arch: 'amd64', os.version: '6.2', java.version: '1.7.0_51'
Driver info: driver.version: RemoteWebDriver
at org.openqa.selenium.phantomjs.PhantomJSCommandExecutor.execute(PhantomJSCommandExecutor.java:91)
at org.openqa.selenium.remote.RemoteWebDriver.execute(RemoteWebDriver.java:527)
... 27 more
Caused by: java.net.ConnectException: Connection refused: connect
at java.net.DualStackPlainSocketImpl.waitForConnect(Native Method)
at java.net.DualStackPlainSocketImpl.socketConnect(DualStackPlainSocketImpl.java:85)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:339)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:200)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:182)
at java.net.PlainSocketImpl.connect(PlainSocketImpl.java:172)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:579)
at org.apache.http.conn.scheme.PlainSocketFactory.connectSocket(PlainSocketFactory.java:117)
at org.apache.http.impl.conn.DefaultClientConnectionOperator.openConnection(DefaultClientConnectionOperator.java:177)
at org.apache.http.impl.conn.AbstractPoolEntry.open(AbstractPoolEntry.java:144)
at org.apache.http.impl.conn.AbstractPooledConnAdapter.open(AbstractPooledConnAdapter.java:131)
at org.apache.http.impl.client.DefaultRequestDirector.tryConnect(DefaultRequestDirector.java:611)
at org.apache.http.impl.client.DefaultRequestDirector.execute(DefaultRequestDirector.java:446)
at org.apache.http.impl.client.AbstractHttpClient.doExecute(AbstractHttpClient.java:882)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:71)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:55)
at org.openqa.selenium.remote.HttpCommandExecutor.fallBackExecute(HttpCommandExecutor.java:319)
at org.openqa.selenium.remote.HttpCommandExecutor.execute(HttpCommandExecutor.java:298)
at org.openqa.selenium.phantomjs.PhantomJSCommandExecutor.execute(PhantomJSCommandExecutor.java:82)
... 28 more
可能的原因是什么?