需要有关此表达式的帮助
"//tr[td[normalize-space(font) = '"+params[1]+"']]/td/font/text()"
我试图从这个HTML文档中获取信息
<table width="575" border="0" cellspacing="1" cellpadding="0">
<tr>
<td width="39" class="back1"><b class="texto4">CRN</b></td>
<td width="60" class="back1"><b class="texto4">Materia</b></td>
<td width="53" class="back1"><b class="texto4">Sección</b></td>
<td width="55" class="back1"><b class="texto4">Créditos</b></td>
<td width="156" class="back1"><b class="texto4">Título</b></td>
<td width="69" class="back1"><b class="texto4">Cupo</b></td>
<td width="57" class="back1"><b class="texto4">Inscritos</b></td>
<td width="77" class="back1"><b class="texto4">Disponible</b></td>
</tr>
<tr>
<td width="39"><font class="texto4">
10110 </font></td>
<td width="60"><font class="texto4">
IIND1000 </font></td>
<td width="53"><font class="texto4">
<div align="center">
1 </div></font></td>
<td width="55"><font class="texto4">
<div align="center">
3 </div>
</font></td>
<td width="156"><font class="texto4">
INTROD. INGEN. INDUSTRIAL </font></td>
<td width="69"><font class="texto4">
100 </font></td>
<td width="57"><font class="texto4">
100 </font></td>
<td width="77"><font class="texto4">
0 </font></td>
</tr>
</table>
如果我查找 params[1] = 10110,我想获得该 tr 标签中每个 td 元素的文本(10110、IIND1000、1、3、INTROD. INGEN. INDUSTRIAL、100、100、0)。
JTidy 并没有很好地完成这项工作(它在处理 font 和 div 之间的空白时遇到了问题),所以我决定改用 Jsoup。有人碰巧知道如何转换开头的那个 XPath 表达式,以便它可以在 Jsoup 中使用吗?
到目前为止,我已设法获得此表达式:font.texto4:contains(10110)
,仅获得 "10110"。但是,我还没有找到办法获取同一层级上每个节点的文本。
编辑:我是 Jsoup 的新手,但我尝试了更多的表达式并检查了结果。我发现如果我尝试这个表达式 tr>td:contains(10110) font.texto4
,我会得到表格中每个元素的文本。我只想将其缩小到同一级别的tr节点集。
答案 0 :(得分:2)
可以用xpath和jsoup两种方式完成。考虑这个例子。
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * Demonstrates two ways of printing every cell of the table row that contains
 * a given key cell: once with the JDK XPath API and once with jsoup.
 *
 * <p>Both variants print the text of <em>all</em> {@code td} elements of the
 * matching row, including the key cell itself, one per line.
 */
public class SibilingParse {

    /** Text of the cell that identifies the wanted row. */
    private static final String ROW_KEY = "10110";

    /**
     * Runs both demonstrations against an in-memory sample table.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = "<table width='575' border='0' cellspacing='1' cellpadding='0'>"
                + "<tr>"
                + "<td width='39'><font class='texto4'>10110</font></td>"
                + "<td width='60'><font class='texto4'>IIND1000</font></td>"
                + "<td width='53'><font class='texto4'><div align='center'>1</div></font></td>"
                + "<td width='55'><font class='texto4'><div align='center'>3</div></font></td>"
                + "<td width='156'><font class='texto4'>INTROD. INGEN. INDUSTRIAL</font></td>"
                + "<td width='69'><font class='texto4'>100</font></td>"
                + "<td width='57'><font class='texto4'>100</font></td>"
                + "<td width='77'><font class='texto4'>0</font></td>"
                + "</tr>"
                + "</table>";
        try {
            // Xpath way
            System.out.println("XPATH");
            printRowWithXPath(html, ROW_KEY);
            System.out.println();
            // Jsoup way
            printRowWithJsoup(html, ROW_KEY);
        } catch (ParserConfigurationException | SAXException | IOException | XPathExpressionException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses {@code html} as XML and prints the text of every cell in the row
     * that has a cell whose whitespace-normalised text equals {@code key}.
     *
     * @param html well-formed markup (it is fed to an XML parser, not an HTML parser)
     * @param key  exact cell text to look for; must not contain a single quote
     *             because it is embedded in an XPath string literal
     */
    private static void printRowWithXPath(String html, String key)
            throws ParserConfigurationException, SAXException, IOException, XPathExpressionException {
        // Encode explicitly so the result does not depend on the platform charset.
        InputStream xmlStream =
                new ByteArrayInputStream(html.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document xmlDocument = builder.parse(xmlStream);

        XPath xPath = XPathFactory.newInstance().newXPath();
        // Select the row by its key cell, then take ALL of its cells. The previous
        // following-sibling axis skipped the key cell itself.
        String expression = "//tr[td[normalize-space(.) = '" + key + "']]/td";
        NodeList cells = (NodeList) xPath.compile(expression)
                .evaluate(xmlDocument, XPathConstants.NODESET);
        for (int i = 0; i < cells.getLength(); i++) {
            // getTextContent() concatenates nested text, so cells wrapping a <div> work too.
            System.out.println(cells.item(i).getTextContent().trim());
        }
    }

    /**
     * Parses {@code html} with jsoup and prints the text of every cell in the
     * first row that has a cell whose text equals {@code key}.
     *
     * @param html markup to parse
     * @param key  exact cell text to look for
     */
    private static void printRowWithJsoup(String html, String key) {
        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        for (Element cell : doc.select("td")) {
            // Exact comparison: ":contains(10110)" would also match e.g. "110110".
            if (key.equals(cell.text())) {
                // Children of the parent row = the key cell plus all its siblings.
                for (Element td : cell.parent().children()) {
                    System.out.println(td.text());
                }
                return;
            }
        }
    }
}
基于网址
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the course listing page and prints every cell of the table row
 * whose key cell holds the wanted CRN, one cell text per line.
 */
public class SiblingJsoup {

    /** Page that contains the course table. */
    private static final String LISTING_URL =
            "http://registroapps.uniandes.edu.co/scripts/adm_con_horario1_joomla.php?depto=IIND";

    /** Text of the cell that identifies the wanted row. */
    private static final String ROW_KEY = "10110";

    /** Connect/read timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20000;

    /**
     * Fetches the page, prints the matching row and then {@code Done}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            Document doc = Jsoup.connect(LISTING_URL)
                    .timeout(TIMEOUT_MILLIS)
                    .get();
            printRow(doc, ROW_KEY);
            System.out.println("Done");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the text of every cell in the first row that has a cell whose
     * text equals {@code key}. Prints nothing when no such cell exists.
     *
     * @param doc parsed page
     * @param key exact cell text to look for
     */
    private static void printRow(Document doc, String key) {
        Elements cells = doc.select("td");
        for (Element cell : cells) {
            // Exact comparison: ":containsOwn(10110)" would also match e.g. "110110".
            if (key.equals(cell.text())) {
                // Children of the parent row include the key cell itself, which
                // siblingElements() would have left out.
                for (Element td : cell.parent().children()) {
                    System.out.println(td.text());
                }
                return;
            }
        }
    }
}