我有以下 HTML,请问如何使用 JSoup 获取从 "<html" 到 "<a id="summary"></a>" 之间的文本?
我尝试了下面这个带正则表达式的选择器,但它返回的是空字符串:
doc.select("*:matches(^[<html]*[a>]$)")
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>TestNG: Unit Test</title>
</head>
<body>
<a id="summary"></a>
<table cellspacing=0 cellpadding=0 class="param" style="float: left; width:630px;">
<tr><th>Test</th><th class="numi">Methods<br/>Passed</th><th class="numi">Scenarios<br/>Passed</th><th class="numi"># skipped</th><th class="numi"># failed</th><th class="numi">Total<br/>Time</th><th class="numi">Included<br/>Groups</th><th class="numi">Excluded<br/>Groups</th></tr>
</table>
</body></html>
答案 0 :(得分:0)
这有点棘手,因为你必须对 DOM 进行深度优先遍历。NodeTraversor 可以帮你做到这一点。
以下是一个例子:
package stuff;
import java.util.ArrayDeque;
import java.util.Deque;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
/**
 * Demonstrates collecting the markup of an HTML document that precedes the
 * {@code <a id="summary">} anchor by walking the parsed {@code <body>} depth-first.
 */
public class A {

    /**
     * Runs {@link #parse(String)} on two sample documents and prints each result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String html = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" +
                "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
                "<head>" +
                "<title>TestNG: Unit Test</title>" +
                "</head>" +
                "<body>" +
                "<a id=\"summary\"></a>" +
                "<table cellspacing=0 cellpadding=0 class=\"param\" style=\"float: left; width:630px;\">" +
                "<tr><th>Test</th><th class=\"numi\">Methods<br/>Passed</th><th class=\"numi\">Scenarios<br/>Passed</th><th class=\"numi\"># skipped</th><th class=\"numi\"># failed</th><th class=\"numi\">Total<br/>Time</th><th class=\"numi\">Included<br/>Groups</th><th class=\"numi\">Excluded<br/>Groups</th></tr>" +
                "</table>" +
                "</body>" +
                "</html>";
        System.out.println(parse(html));
        String html2 = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" +
                "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
                "<head>" +
                "<title>TestNG: Unit Test</title>" +
                "</head>" +
                "<body>" +
                "<a id=\"something_else\"></a>" +
                "<a id=\"summary\"></a>" +
                "<table cellspacing=0 cellpadding=0 class=\"param\" style=\"float: left; width:630px;\">" +
                "<tr><th>Test</th><th class=\"numi\">Methods<br/>Passed</th><th class=\"numi\">Scenarios<br/>Passed</th><th class=\"numi\"># skipped</th><th class=\"numi\"># failed</th><th class=\"numi\">Total<br/>Time</th><th class=\"numi\">Included<br/>Groups</th><th class=\"numi\">Excluded<br/>Groups</th></tr>" +
                "</table>" +
                "</body>" +
                "</html>";
        System.out.println(parse(html2));
    }

    /**
     * Builds a string made of the inner HTML of {@code <head>}, a literal {@code <body>}
     * tag, and the outer HTML of the body elements that are completed before the
     * {@code <a id="summary">} anchor is reached. The anchor itself is not included.
     *
     * <p>Each completed element is emitted exactly once: when an element finishes, whatever
     * its descendants already wrote is replaced by the element's own outer HTML, which
     * contains them. Text nodes that sit directly between emitted elements are not written.
     *
     * @param html the HTML source to parse
     * @return the collected markup up to, but excluding, the summary anchor
     */
    public static String parse(String html) {
        Document document = Jsoup.parse(html);
        final StringBuilder buffer = new StringBuilder();
        // Buffer length at the moment each node on the current traversal path was entered.
        final Deque<Integer> startOffsets = new ArrayDeque<Integer>();
        NodeTraversor nd = new NodeTraversor(new NodeVisitor() {
            private boolean finished = false;

            @Override
            public void head(Node node, int depth) {
                startOffsets.push(buffer.length());
            }

            @Override
            public void tail(Node node, int depth) {
                // Always pop so the offsets stay paired with head() calls.
                int start = startOffsets.pop();
                // depth 0 is the traversal root (<body>); its opening tag is written by hand below.
                if (finished || depth == 0 || !(node instanceof Element)) {
                    return;
                }
                Element element = (Element) node;
                if ("a".equals(element.tagName()) && "summary".equals(element.attr("id"))) {
                    finished = true;
                    return;
                }
                // The outer HTML already contains every descendant, so drop what the
                // descendants appended to avoid emitting them twice.
                buffer.setLength(start);
                buffer.append(element.outerHtml());
            }
        });
        buffer.append(document.head().html());
        buffer.append("<body>");
        nd.traverse(document.body());
        return buffer.toString();
    }
}
这种写法不算特别优雅(尤其是手动执行 buffer.append("<body>"); 这一步)……不过写起来很快 :)
有关相关示例,另请参阅this answer。
答案 1 :(得分:0)
我不确定,但你可以试试下面的做法:当循环遇到 "a" 标签的元素时,就会在这个边界处停止输出。
// Print the selected elements in order, stopping right after the first <a> element.
Elements doc = select("what u want");
for (Element current : doc) {
    System.out.println(current);
    if ("a".equals(current.tagName())) {
        // Boundary reached: nothing after the first <a> is printed.
        break;
    }
}