我实际上是想点击一个链接来下载文件:
我想要下载的那个链接对应的 HTML 代码是:
<a id="a_file" title="Download the zip data file" href="javascript:return true;" target="nullDisplay">HISTDATA_COM_MT_EURUSD_M1_2013.zip</a>
java代码是:
WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
HtmlPage htmlPage=webClient.getPage("http://www.histdata.com/download-free-forex-historical-data/?/metatrader/1-minute-bar-quotes/eurusd/2016/7");
List<HtmlAnchor> anchors=htmlPage.getAnchors();
HtmlAnchor anchor = null;
for (int i = 0; i < anchors.size(); ++i) {
anchor = anchors.get(i);
String sAnchor = anchor.asText();
if (sAnchor.equals("HISTDATA_COM_MT_EURUSD_M1_201607.zip"))
break;
}
Page p = anchor.click();
webClient.waitForBackgroundJavaScript(60000);
InputStream is = p.getWebResponse().getContentAsStream();
int b = 0;
while ((b = is.read()) != -1) {
System.out.print((char)b);
}
我得到的错误信息是:
Jul 12, 2016 1:29:57 PM com.gargoylesoftware.htmlunit.javascript.StrictErrorReporter error
SEVERE: error: message=[invalid return] sourceName=[javascript url] line=[88] lineSource=[return true;] lineOffset=[7]
Exception in thread "main" ======= EXCEPTION START ========
Exception class=[net.sourceforge.htmlunit.corejs.javascript.EvaluatorException]
com.gargoylesoftware.htmlunit.ScriptException: invalid return (javascript url#88)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:904)
at net.sourceforge.htmlunit.corejs.javascript.Context.call(Context.java:628)
at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.call(ContextFactory.java:515)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.compile(JavaScriptEngine.java:729)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.compile(JavaScriptEngine.java:694)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:746)
at com.gargoylesoftware.htmlunit.html.HtmlPage.executeJavaScriptIfPossible(HtmlPage.java:902)
at com.gargoylesoftware.htmlunit.html.HtmlAnchor.doClickStateUpdate(HtmlAnchor.java:114)
at com.gargoylesoftware.htmlunit.html.HtmlAnchor.doClickStateUpdate(HtmlAnchor.java:179)
at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:800)
at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:747)
at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:694)
at clickPage.main(clickPage.java:38)
Caused by: net.sourceforge.htmlunit.corejs.javascript.EvaluatorException: invalid return (javascript url#88)
at com.gargoylesoftware.htmlunit.javascript.StrictErrorReporter.error(StrictErrorReporter.java:65)
at net.sourceforge.htmlunit.corejs.javascript.Parser.addError(Parser.java:188)
at net.sourceforge.htmlunit.corejs.javascript.Parser.addError(Parser.java:167)
at net.sourceforge.htmlunit.corejs.javascript.Parser.reportError(Parser.java:255)
at net.sourceforge.htmlunit.corejs.javascript.Parser.reportError(Parser.java:244)
at net.sourceforge.htmlunit.corejs.javascript.Parser.reportError(Parser.java:237)
at net.sourceforge.htmlunit.corejs.javascript.Parser.returnOrYield(Parser.java:1632)
at net.sourceforge.htmlunit.corejs.javascript.Parser.statementHelper(Parser.java:1022)
at net.sourceforge.htmlunit.corejs.javascript.Parser.statement(Parser.java:928)
at net.sourceforge.htmlunit.corejs.javascript.Parser.parse(Parser.java:572)
at net.sourceforge.htmlunit.corejs.javascript.Parser.parse(Parser.java:492)
at net.sourceforge.htmlunit.corejs.javascript.Context.compileImpl(Context.java:2660)
at net.sourceforge.htmlunit.corejs.javascript.Context.compileString(Context.java:1623)
at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory$TimeoutContext.compileString(HtmlUnitContextFactory.java:172)
at net.sourceforge.htmlunit.corejs.javascript.Context.compileString(Context.java:1615)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$2.doRun(JavaScriptEngine.java:720)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:889)
... 12 more
Enclosed exception:
net.sourceforge.htmlunit.corejs.javascript.EvaluatorException: invalid return (javascript url#88)
at com.gargoylesoftware.htmlunit.javascript.StrictErrorReporter.error(StrictErrorReporter.java:65)
at net.sourceforge.htmlunit.corejs.javascript.Parser.addError(Parser.java:188)
at net.sourceforge.htmlunit.corejs.javascript.Parser.addError(Parser.java:167)
at net.sourceforge.htmlunit.corejs.javascript.Parser.reportError(Parser.java:255)
at net.sourceforge.htmlunit.corejs.javascript.Parser.reportError(Parser.java:244)
at net.sourceforge.htmlunit.corejs.javascript.Parser.reportError(Parser.java:237)
at net.sourceforge.htmlunit.corejs.javascript.Parser.returnOrYield(Parser.java:1632)
at net.sourceforge.htmlunit.corejs.javascript.Parser.statementHelper(Parser.java:1022)
at net.sourceforge.htmlunit.corejs.javascript.Parser.statement(Parser.java:928)
at net.sourceforge.htmlunit.corejs.javascript.Parser.parse(Parser.java:572)
at net.sourceforge.htmlunit.corejs.javascript.Parser.parse(Parser.java:492)
at net.sourceforge.htmlunit.corejs.javascript.Context.compileImpl(Context.java:2660)
at net.sourceforge.htmlunit.corejs.javascript.Context.compileString(Context.java:1623)
at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory$TimeoutContext.compileString(HtmlUnitContextFactory.java:172)
at net.sourceforge.htmlunit.corejs.javascript.Context.compileString(Context.java:1615)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$2.doRun(JavaScriptEngine.java:720)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:889)
at net.sourceforge.htmlunit.corejs.javascript.Context.call(Context.java:628)
at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.call(ContextFactory.java:515)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.compile(JavaScriptEngine.java:729)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.compile(JavaScriptEngine.java:694)
at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:746)
at com.gargoylesoftware.htmlunit.html.HtmlPage.executeJavaScriptIfPossible(HtmlPage.java:902)
at com.gargoylesoftware.htmlunit.html.HtmlAnchor.doClickStateUpdate(HtmlAnchor.java:114)
at com.gargoylesoftware.htmlunit.html.HtmlAnchor.doClickStateUpdate(HtmlAnchor.java:179)
at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:800)
at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:747)
at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:694)
at clickPage.main(clickPage.java:38)
== CALLING JAVASCRIPT ==
return true;
======= EXCEPTION END ========
请告诉我代码中的错误以及如何从给定链接下载文件。
答案 0 :(得分:1)
//Complete solution:
//1. Open the page.
//2. Collect the file URLs on that page using XPath.
//3. Download every file those URLs point to.
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Date;
import java.util.List;
import java.util.Map;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomAttr;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Small crawler: loads a page with HtmlUnit, selects attribute nodes (typically
 * {@code @href}) with an XPath expression, and downloads each referenced file
 * into a local directory.
 *
 * <p>The string values in {@link #main(String[])} are placeholders and must be
 * replaced with real values before running.
 */
public class Crawler {

    /** Maximum number of redirect hops followed per download; guards against redirect loops. */
    private static final int MAX_REDIRECTS = 10;

    /**
     * Loads the configured page, evaluates the XPath and downloads every matched link.
     *
     * @param args unused
     * @throws Throwable if the page cannot be loaded or any download fails
     */
    public static void main(String[] args) throws Throwable {
        String baseUrl = "Enter base http/https url here";
        String url1 = baseUrl + "add addational url of main page";
        String xpathofdownlaodlinks = "xpath of file url or--> html/body/div/div[3]/a/@href";
        String pathToSaveFile = "d:\\local\\to\\save\\files";
        String fileExt = ".txt";

        WebClient webclient = new WebClient(BrowserVersion.CHROME);
        try {
            webclient.getOptions().setJavaScriptEnabled(true);
            HtmlPage page = webclient.getPage(url1);

            int index = 0;
            // Iterate as Object and filter, instead of an unchecked cast to List<DomAttr>:
            // an XPath that selects something other than attributes is skipped, not a
            // ClassCastException.
            for (Object node : page.getByXPath(xpathofdownlaodlinks)) {
                if (!(node instanceof DomAttr)) {
                    continue;
                }
                String link = baseUrl + ((DomAttr) node).getValue();
                // Timestamp alone can repeat within one millisecond; the running index
                // keeps names unique so downloads do not overwrite each other.
                String name = new Date().getTime() + "_" + index + fileExt;
                index++;
                // File(parent, child) inserts the separator, so the file lands inside
                // pathToSaveFile rather than next to it.
                downlaodRawFile(link, new File(pathToSaveFile, name).getPath());
            }
        } finally {
            // Release the client even when loading or a download throws.
            webclient.close();
        }
    }

    /**
     * Downloads {@code link} to the file {@code fileName}, following HTTP redirects
     * manually (including ones that change protocol or use a relative
     * {@code Location}).
     *
     * @param link     absolute http/https URL of the resource
     * @param fileName path of the file to write; missing parent directories are created
     * @throws IOException if the URL is malformed or not http/https, a redirect has no
     *                     {@code Location} header, more than {@link #MAX_REDIRECTS}
     *                     redirects occur, or reading/writing fails
     * @throws Throwable   kept in the signature for compatibility with existing callers
     */
    public static void downlaodRawFile(String link, String fileName) throws IOException, Throwable {
        URL url = new URL(link);
        HttpURLConnection http = openHttp(url);
        try {
            int hops = 0;
            while (isRedirected(http.getResponseCode())) {
                hops++;
                if (hops > MAX_REDIRECTS) {
                    throw new IOException("Too many redirects (> " + MAX_REDIRECTS + ") for " + link);
                }
                String location = http.getHeaderField("Location");
                if (location == null) {
                    throw new IOException("Redirect without Location header from " + url);
                }
                http.disconnect();
                // Resolve against the current URL so relative Location values work.
                url = new URL(url, location);
                http = openHttp(url);
            }

            // Open the response first: if the server answers with an error, nothing
            // is created on disk.
            try (InputStream input = http.getInputStream()) {
                File target = new File(fileName);
                File parent = target.getParentFile();
                if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                    throw new IOException("Cannot create directory " + parent);
                }
                try (OutputStream output = new FileOutputStream(target)) {
                    byte[] buffer = new byte[4096];
                    int n;
                    while ((n = input.read(buffer)) != -1) {
                        output.write(buffer, 0, n);
                    }
                }
            }
        } finally {
            http.disconnect();
        }
    }

    /** Opens an HTTP(S) connection, rejecting other protocols instead of failing on the cast. */
    private static HttpURLConnection openHttp(URL url) throws IOException {
        String protocol = url.getProtocol();
        if (!"http".equals(protocol) && !"https".equals(protocol)) {
            throw new IOException("Unsupported protocol: " + url);
        }
        return (HttpURLConnection) url.openConnection();
    }

    /** Returns true for the HTTP status codes that redirect a GET request to another URL. */
    private static boolean isRedirected(int statusCode) {
        return statusCode == HttpURLConnection.HTTP_MOVED_PERM   // 301
                || statusCode == HttpURLConnection.HTTP_MOVED_TEMP // 302
                || statusCode == HttpURLConnection.HTTP_SEE_OTHER  // 303
                || statusCode == 307
                || statusCode == 308;
    }
}
答案 1 :(得分:0)
感谢您的报告,该错误现已在 SVN 中修复。
请使用最新的构建版本(latest build)或快照版本。