使用Jsoup的XPath表达式

时间:2014-07-15 04:19:10

标签: java xpath html-parsing jsoup

需要有关此表达式的帮助

"//tr[td[normalize-space(font) = '"+params[1]+"']]/td/font/text()"

我试图从这个HTML文档中获取信息

<table width="575" border="0" cellspacing="1" cellpadding="0">
    <tr> 
      <td width="39" class="back1"><b class="texto4">CRN</b></td>
      <td width="60" class="back1"><b class="texto4">Materia</b></td>
      <td width="53" class="back1"><b class="texto4">Secci&oacute;n</b></td>
      <td width="55" class="back1"><b class="texto4">Cr&eacute;ditos</b></td>
      <td width="156" class="back1"><b class="texto4">T&iacute;tulo</b></td>
      <td width="69" class="back1"><b class="texto4">Cupo</b></td>
      <td width="57" class="back1"><b class="texto4">Inscritos</b></td>
      <td width="77" class="back1"><b class="texto4">Disponible</b></td>
    </tr>
    <tr> 
      <td width="39"><font class="texto4"> 
        10110                        </font></td>
      <td width="60"><font class="texto4"> 
        IIND1000                        </font></td>
      <td width="53"><font class="texto4"> 
      <div align="center">
        1                        </div></font></td>
      <td width="55"><font class="texto4"> 
        <div align="center">
        3                       </div>
        </font></td>
      <td width="156"><font class="texto4"> 
        INTROD. INGEN. INDUSTRIAL                        </font></td>
      <td width="69"><font class="texto4"> 
        100                        </font></td>
      <td width="57"><font class="texto4"> 
        100                        </font></td>
      <td width="77"><font class="texto4"> 
        0                        </font></td>
    </tr>
</table>

如果我查找 params[1] = 10110,我想获得该 tr 标签中每个 td 元素的文本(10110,IIND1000,1,3,INTROD. INGEN. INDUSTRIAL,100,100,0)。

JTidy 没能很好地完成这项工作(它在处理 font 和 div 之间的空白字符时会出问题),所以我决定改用 Jsoup。有人知道如何转换本文开头的那个 XPath 表达式,使它可以在 Jsoup 中使用吗?

到目前为止,我已经得到了这个表达式:font.texto4:contains(10110),但它只返回 "10110"。我还没有找到办法获取同一层级上每个兄弟节点的文本。

EDIT:我是 Jsoup 的新手,但我又尝试了几个表达式并检查了结果。我发现如果使用表达式 tr>td:contains(10110) font.texto4,会得到表格中每个元素的文本。我只想把结果缩小到同一个 tr 节点下的那组节点。

1 个答案:

答案 0 :(得分:2)

可以用xpath和jsoup两种方式完成。考虑这个例子。

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


/**
 * Demonstrates two ways of reading the cells that share a table row with a
 * cell holding a known value: the JDK XPath API and Jsoup CSS selectors.
 */
public class SibilingParse {

    /** Cell value used to locate the row of interest. */
    private static final String CRN = "10110";

    /**
     * Parses a hard-coded HTML table and prints the text of the sibling cells
     * of the cell containing {@link #CRN}, first via XPath, then via Jsoup.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = "<table width='575' border='0' cellspacing='1' cellpadding='0'>"
                        + "<tr>"
                            + "<td width='39'><font class='texto4'>10110</font></td>"
                            + "<td width='60'><font class='texto4'>IIND1000</font></td>"
                            + "<td width='53'><font class='texto4'><div align='center'>1</div></font></td>"
                            + "<td width='55'><font class='texto4'><div align='center'>3</div></font></td>"
                            + "<td width='156'><font class='texto4'>INTROD. INGEN. INDUSTRIAL</font></td>"
                            + "<td width='69'><font class='texto4'>100</font></td>"
                            + "<td width='57'><font class='texto4'>100</font></td>"
                            + "<td width='77'><font class='texto4'>0</font></td>"
                        + "</tr>"
                    + "</table>";

        try {
            // Xpath way
            System.out.println("XPATH");
            printSiblingsWithXPath(html);
            System.out.println();

            // Jsoup way
            printSiblingsWithJsoup(html);
        } catch (ParserConfigurationException | SAXException | IOException | XPathExpressionException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses {@code html} as XML and prints the text of every {@code td} that
     * follows the cell whose whitespace-normalized text equals {@link #CRN}.
     *
     * @param html well-formed markup whose root element is {@code table}
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if {@code html} is not well-formed XML
     * @throws IOException if reading the in-memory stream fails
     * @throws XPathExpressionException if the XPath expression cannot be evaluated
     */
    private static void printSiblingsWithXPath(String html)
            throws ParserConfigurationException, SAXException, IOException, XPathExpressionException {
        // Encode explicitly: the XML parser assumes UTF-8 when there is no
        // XML declaration, whereas getBytes() would use the platform charset.
        InputStream xmlStream = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document xmlDocument = builder.parse(xmlStream);
        XPath xPath = XPathFactory.newInstance().newXPath();

        // Match on the td's whole string value so the lookup still works when
        // the text is wrapped in extra elements (font/div) or padded with
        // whitespace. following-sibling is enough here because the lookup
        // cell is the first cell of the row.
        String expression = "/table/tr/td[normalize-space(.)='" + CRN + "']/following-sibling::td";
        NodeList cells = (NodeList) xPath.evaluate(expression, xmlDocument, XPathConstants.NODESET);
        for (int i = 0; i < cells.getLength(); i++) {
            // getTextContent() on the cell itself avoids a NullPointerException
            // on an empty td, which has no first child.
            System.out.println(cells.item(i).getTextContent().trim());
        }
    }

    /**
     * Parses {@code html} with Jsoup and prints the text of every sibling of
     * the first {@code td} whose text contains {@link #CRN}.
     *
     * @param html markup containing the table to inspect
     */
    private static void printSiblingsWithJsoup(String html) {
        // Fully qualified because org.w3c.dom.Document is already imported.
        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        Elements matches = doc.select("td:contains(" + CRN + ")");
        if (matches.isEmpty()) {
            return;
        }
        for (Element td : matches.first().siblingElements()) {
            System.out.println(td.text());
        }
    }

}

基于网址

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches the course listing page and prints the cells that sit next to the
 * cell whose font element directly contains the text 10110.
 */
public class SiblingJsoup {

    /**
     * Downloads the page, locates the matching font element, and prints the
     * text of each sibling of its nearest ancestor, followed by "Done".
     * If the download fails, the stack trace is printed and nothing else.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Document page;
        try {
            page = Jsoup
                    .connect("http://registroapps.uniandes.edu.co/scripts/adm_con_horario1_joomla.php?depto=IIND")
                    .timeout(20000)
                    .get();
        } catch (IOException fetchFailure) {
            // Only the fetch can raise IOException; nothing further to do.
            fetchFailure.printStackTrace();
            return;
        }

        Elements fonts = page.select("font:containsOwn(10110)");
        boolean found = fonts != null && !fonts.isEmpty();
        if (found) {
            // parents() lists ancestors nearest-first, so first() is the
            // element directly enclosing the first matching font.
            Element enclosingCell = fonts.parents().first();
            Elements neighbours = enclosingCell.siblingElements();
            for (Element neighbour : neighbours) {
                System.out.println(neighbour.text());
            }
        }

        System.out.println("Done");
    }

}