当我更换链接时,即使页面结构相同,得到的结果也不一样。
有没有办法解析标签之外的文本数据,或者指定解析文本的起止位置(例如从 <br> 标签到 <a> 标签)?
// Download both court-decision pages. Every lookup below reads the first page only;
// the second document is fetched but not inspected in this snippet.
Document tunisie = Jsoup.connect("http://www.juricaf.org/arret/TUNISIE-COURDECASSATION-20060126-5216").get();
Document tunisie2 = Jsoup.connect("http://www.juricaf.org/arret/TUNISIE-COURDECASSATION-20051229-33412004").get();

// All extracted fields live inside the first <div class="arret">.
Element arret = tunisie.select("div.arret").first();

// Title: text of the element whose id is "titre".
System.out.println(arret.select("#titre").text());

// Decision number: the node that immediately follows the first <h3>.
String decisionNumber = arret.select("h3").first().nextSibling().toString();
System.out.println(decisionNumber);

// NOR number: the node that immediately follows the first <br> (index 0).
String norNumber = arret.getElementsByTag("br").first().nextSibling().toString();
System.out.println(norNumber);

// URN identifier: same approach, using the second <br> (index 1).
String urnIdentifier = arret.getElementsByTag("br").get(1).nextSibling().toString();
System.out.println(urnIdentifier + "\n");

// "Analyses" heading: <h2> elements that are direct children of a <blockquote>.
System.out.println("Analyses : "+ arret.select("blockquote > h2").text());

// "Analyses" body: <p> elements that are direct children of a <blockquote>.
System.out.println(arret.select("blockquote > p").text()+ "\n");

// Parties, claimant side: the node after the third <h3>, then the own text of the fifth <a>.
// NOTE(review): these positions are hard-coded for this page layout; a page with a
// different number of <h3>/<br>/<a> elements will yield other fields or throw.
String claimantLabel = arret.getElementsByTag("h3").get(2).nextSibling().toString();
String claimantName = arret.select("a").get(4).ownText();
System.out.println(claimantLabel + claimantName);

// Parties, defendant side: the node after the fourth <br>, then the own text of the sixth <a>.
String defendantLabel = arret.getElementsByTag("br").get(3).nextSibling().toString();
String defendantName = arret.select("a").get(5).ownText();
System.out.println(defendantLabel + defendantName + "\n");

// Decision text: string form of every <p> that is a direct child of a <span>
// (the HTML of the matched elements, per the original comment), followed by a line break.
String decisionHtml = arret.select("span > p") + "\n";
System.out.println(decisionHtml.trim()+"\n");