我正在尝试抓取Google Patents DB。我的问题是HTMLUnit没有得到完整的页面。它似乎只到达“登录”按钮并结束。
我正在尝试解析以下Website。
我的代码“currentpage.asText()”仅获取: 谷歌专利 登录
任何人都可以帮我或者有类似的问题吗?
public class HTMLUnitGoogle3 {
@Test
public void patentPage() throws ParseException, InterruptedException, IOException{
HashMap<String, String> patente = new HashMap<String,String>();
// Abfragezeitraum
DateFormat df = new SimpleDateFormat("yyyyMMdd");
// Daten
Date startDate = df.parse("19870101");
Date endDate = df.parse("19880101");
String unternehmen = "Mannesmann";
Calendar c = Calendar.getInstance();
c.setTime(startDate);
c.add(Calendar.MONTH, 3);
Date tempDate = c.getTime();
System.out.println("Vor der While - Datum: " + df.format(startDate) + " " + df.format(tempDate) + " " + df.format(endDate));
// Erstelle eine Liste mit Links zu den Patenten auf der Seite
ArrayList<String> ergebnisList = new ArrayList<String>();
while(endDate.after(tempDate) || endDate.equals(tempDate)){
// Aufbau der Umgebung
System.out.println("Datum: " + df.format(startDate) + " " + df.format(tempDate) + " " + df.format(endDate));
int seitenzaehler = 0;
String seite = "";
boolean weitereSeite = true;
while(weitereSeite){
// Aufruf der Seite
WebClient webClient = new WebClient(BrowserVersion.INTERNET_EXPLORER);
HtmlPage currentPage = seitenAufbau(webClient, df.format(startDate), df.format(tempDate), unternehmen, seite);
System.out.println(currentPage.asText());
listAllDivAnchors(currentPage);
List<?> ws = currentPage.getByXPath("//search-results");
for(int i = 0; i<ws.size();i++){
System.out.println(ws.get(i).toString());
}
HtmlHeading1 h1Title = (HtmlHeading1) currentPage.getElementById("title");
System.out.println("Seite = " + h1Title.asText());
HtmlAnchor weiter = currentPage.getHtmlElementById("nextResult");
currentPage.cleanUp();
webClient.close();
webClient = null;
}
// Addiere 3 Monate auf start und endDate
startDate = tempDate;
Calendar c1 = Calendar.getInstance();
c1.setTime(tempDate);
c1.add(Calendar.MONTH, 3);
tempDate = c1.getTime();
System.out.println("Neues Datum: " + df.format(startDate) + " " + df.format(tempDate) + " " + df.format(endDate));
}
}
private void listAllDivAnchors(HtmlPage currentPage) {
// TODO Auto-generated method stub
List<?> divs = currentPage.getByXPath("//div");
System.out.println("Divs Size: " + divs.size());
for(int i = 0; i<divs.size();i++){
System.out.println(divs.get(i).toString());
}
List<?> ii = currentPage.getByXPath("//a");
for(int i = 0; i<ii.size();i++){
System.out.println(ii.get(i).toString());
}
List<?> i2 = currentPage.getByXPath("//input");
for(int i = 0; i<i2.size();i++){
System.out.println(i2.get(i).toString());
}
}
public HtmlPage seitenAufbau(WebClient webClient,String start , String ende, String unternehmen, String seite) throws InterruptedException{
HtmlPage page = null;
String link = "https://patents.google.com/?"+
"assignee="+ unternehmen
+"&before="+ ende +
"&after=priority:" + start +
"&clustered=false"
+ "&sort=old"
+ seite;
System.out.println(link);
try {
webClient.getOptions().setThrowExceptionOnScriptError(false);
java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.waitForBackgroundJavaScript(80000);
page = webClient.getPage(link);
} catch (FailingHttpStatusCodeException e) {
e.printStackTrace();
System.out.println(e.getMessage());
} catch (MalformedURLException e) {
System.out.println(e.getMessage());
e.printStackTrace();
} catch (IOException e) {
System.out.println("ex");
System.out.println(e.getMessage());
e.printStackTrace();
}
JavaScriptJobManager manager = page.getEnclosingWindow().getJobManager();
while (manager.getJobCount() > 0) {
Thread.sleep(1000);
}
return page;
}
如果我删除了throwExceptionOnScriptError,我会得到HTMLUnit错误:
======= EXCEPTION START ========
EcmaError: lineNumber=[11] column=[0] lineSource=[<no source>] name=[TypeError] sourceName=[gstatic.com/patent-search/frontend/patent-search.search_20160829_RC02/scs/compiled_dir/webcomponentsjs/webcomponents-lite.min.js] message=[TypeError: Expected argument of type object, but instead had type object (...