使用JSoup基于标签范围的HTML搜索

时间:2012-10-19 10:08:29

标签: jsoup

我有以下HTML,请告诉我如何使用JSoup获取从"<html"到"<a id="summary"></a>"之间的文本。我尝试使用了以下正则表达式,但它返回空字符串。

doc.select("*:matches(^[<html]*[a>]$)")

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>TestNG:  Unit Test</title>
 </head>
<body>
<a id="summary"></a>

<table cellspacing=0 cellpadding=0 class="param" style="float: left; width:630px;">
<tr><th>Test</th><th class="numi">Methods<br/>Passed</th><th class="numi">Scenarios<br/>Passed</th><th class="numi"># skipped</th><th class="numi"># failed</th><th class="numi">Total<br/>Time</th><th class="numi">Included<br/>Groups</th><th class="numi">Excluded<br/>Groups</th></tr>   
</table>


</body></html>

2 个答案:

答案 0 :(得分:0)

这有点棘手,因为你必须以深度优先的方式遍历DOM。NodeTraversor可以帮你做到这一点。

以下是一个例子:

package stuff;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

/**
 * Demonstrates how to extract the part of an HTML page that precedes the
 * {@code <a id="summary">} anchor by walking the parsed DOM depth-first with
 * jsoup's {@code NodeTraversor}.
 */
public class A {

    /**
     * Runs {@link #parse(String)} on two sample pages and prints each result:
     * one where the summary anchor is the first node of the body, and one where
     * another anchor precedes it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println(parse(samplePage("")));
        System.out.println(parse(samplePage("<a id=\"something_else\"></a>")));
    }

    /**
     * Builds the sample TestNG report page used by {@link #main(String[])}.
     *
     * @param beforeSummary markup inserted into the body directly in front of
     *                      the summary anchor (may be empty)
     * @return the complete sample document as a string
     */
    private static String samplePage(String beforeSummary) {
        return "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" +
                "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
                "<head>" +
                "<title>TestNG:  Unit Test</title>" +
                "</head>" +
                "<body>" +
                beforeSummary +
                "<a id=\"summary\"></a>" +
                "<table cellspacing=0 cellpadding=0 class=\"param\" style=\"float: left; width:630px;\">" +
                "<tr><th>Test</th><th class=\"numi\">Methods<br/>Passed</th><th class=\"numi\">Scenarios<br/>Passed</th><th class=\"numi\"># skipped</th><th class=\"numi\"># failed</th><th class=\"numi\">Total<br/>Time</th><th class=\"numi\">Included<br/>Groups</th><th class=\"numi\">Excluded<br/>Groups</th></tr>" +
                "</table>" +
                "</body>" +
                "</html>";
    }

    /**
     * Returns the markup of {@code html} that comes before the first
     * {@code <a id="summary">} element.
     *
     * <p>The result is the inner HTML of the document head, followed by the
     * literal {@code <body>}, followed by the outer HTML of every node below
     * the body that is completely closed before the anchor is reached in
     * document order. Elements that contain the anchor contribute none of
     * their own tags. When the document has no such anchor, every child node
     * of the body is emitted.
     *
     * <p>Each node is emitted exactly once: when an element finishes, its
     * outer HTML replaces whatever its descendants had already written, so
     * nested content is not duplicated.
     *
     * @param html the HTML source to parse
     * @return the markup preceding the summary anchor, as described above
     */
    public static String parse(String html) {
        Document document = Jsoup.parse(html);
        final StringBuilder buffer = new StringBuilder();
        // NOTE(review): newer jsoup releases appear to expose traversal as the
        // static NodeTraversor.traverse(visitor, root) instead of this
        // constructor - confirm against the jsoup version in use.
        NodeTraversor nd = new NodeTraversor(new NodeVisitor() {

            /** Set once the summary anchor has been entered; nothing is emitted afterwards. */
            private boolean finished = false;

            /** Buffer length at the moment the currently open node of each depth was entered. */
            private int[] startOffsets = new int[16];

            @Override
            public void head(Node node, int depth) {
                if (finished) {
                    return;
                }
                if (isSummaryAnchor(node)) {
                    // Stop on entry so that nothing inside the anchor is emitted either.
                    finished = true;
                    return;
                }
                if (depth >= startOffsets.length) {
                    startOffsets = java.util.Arrays.copyOf(
                            startOffsets, Math.max(depth + 1, startOffsets.length * 2));
                }
                startOffsets[depth] = buffer.length();
            }

            @Override
            public void tail(Node node, int depth) {
                // Depth 0 is the traversal root (the body), whose opening tag is
                // written separately below.
                if (finished || depth == 0) {
                    return;
                }
                // Discard what this node's descendants wrote: the outer HTML
                // appended next already contains them.
                buffer.setLength(startOffsets[depth]);
                buffer.append(node.outerHtml());
            }
        });
        buffer.append(document.head().html());
        buffer.append("<body>");
        nd.traverse(document.body());
        return buffer.toString();
    }

    /**
     * Tells whether {@code node} is the anchor that ends the extracted range.
     *
     * @param node the node to test
     * @return {@code true} if the node is an {@code a} element whose
     *         {@code id} attribute equals {@code summary}
     */
    private static boolean isSummaryAnchor(Node node) {
        if (!(node instanceof Element)) {
            return false;
        }
        Element element = (Element) node;
        return "a".equals(element.tagName()) && "summary".equals(element.attr("id"));
    }
}

这种写法不算特别优雅(尤其是buffer.append("<body>");这一句)……不过这是一个很快就能写出来的方案 :)

如需相关示例,另请参阅this answer。

答案 1 :(得分:0)

我不确定,但你可以试试下面的做法:当遍历到"a"标签的元素时,处理就会在这个边界处停止。

// Print the selected elements in order and stop right after the first <a>
// element has been printed; that element marks the boundary.
Elements doc = select("what u want");
for (Element element : doc) {
    System.out.println(element);
    if ("a".equals(element.tagName())) {
        break;
    }
}