假设我有以下html:
<html>
<head>
</head>
<body>
<div id="wrapper" >
<div class="s2">I am going <a title="some title" href="">by flying</a>
<p>mr tt</p>
</div>
</div>
</body>
</html>
我想把文本节点中所有长度大于或等于 4 个字符的单词(例如单词“going”)在原始 html 中替换为带标签的 html 内容(而不是纯文本),例如 <span>going</span>,
同时不改动其他任何内容。
如果我尝试执行 element.html(replacement) 之类的操作,问题在于:假设当前元素是 <div class="s2">,
这样做还会把其中的 <a title="some title"> 子元素一并清除。
答案 0 :(得分:12)
在这种情况下,您必须按照这个回答(this answer)的建议遍历整个文档。以下是使用 Jsoup API 实现的方法:
NodeTraversor 和 NodeVisitor 允许您遍历 DOM;
Node.replaceWith(...) 允许替换 DOM 中的节点。
以下是代码:
/**
 * Demonstrates wrapping every word of at least {@link #MIN_WORD_LENGTH}
 * characters that occurs in a text node in a {@code <span>} element, while
 * leaving the surrounding markup (elements and their attributes) untouched.
 */
public class JsoupReplacer {

    /** Words with at least this many characters are wrapped in a span. */
    private static final int MIN_WORD_LENGTH = 4;

    public static void main(String[] args) {
        so6527876();
    }

    /**
     * Parses the sample document, replaces each text node that contains a
     * long word with a node carrying the span-wrapped markup, and prints the
     * resulting document to standard output.
     */
    public static void so6527876() {
        String html =
                "<html>" +
                "<head>" +
                "</head>" +
                "<body>" +
                " <div id=\"wrapper\" >" +
                " <div class=\"s2\">I am going <a title=\"some title\" href=\"\">by flying</a>" +
                " <p>mr tt</p>" +
                " </div> " +
                " </div>" +
                "</body> " +
                "</html>";
        Document doc = Jsoup.parse(html);

        // Collect first and replace afterwards, so the tree is never modified
        // while it is being traversed.
        final List<TextNode> nodesToChange = new ArrayList<TextNode>();
        NodeTraversor nd = new NodeTraversor(new NodeVisitor() {
            @Override
            public void tail(Node node, int depth) {
                if (node instanceof TextNode) {
                    TextNode textNode = (TextNode) node;
                    if (containsLongWord(textNode.getWholeText())) {
                        nodesToChange.add(textNode);
                    }
                }
            }

            @Override
            public void head(Node node, int depth) {
                // Nothing to do on the way down; all work happens in tail().
            }
        });
        nd.traverse(doc.body());

        for (TextNode textNode : nodesToChange) {
            Node newNode = buildElementForText(textNode);
            textNode.replaceWith(newNode);
        }

        System.out.println("result: ");
        System.out.println();
        System.out.println(doc);
    }

    /**
     * Tells whether {@code text} contains a run of at least
     * {@link #MIN_WORD_LENGTH} consecutive non-whitespace characters.
     *
     * @param text the text to scan
     * @return {@code true} if at least one long word is present
     */
    private static boolean containsLongWord(String text) {
        int runLength = 0;
        for (int i = 0; i < text.length(); i++) {
            if (Character.isWhitespace(text.charAt(i))) {
                runLength = 0;
            } else if (++runLength >= MIN_WORD_LENGTH) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the node that replaces {@code textNode}: the same text with
     * every long word wrapped in {@code <span>...</span>}.
     *
     * @param textNode the text node whose content is rewritten
     * @return a data node holding the rewritten markup
     */
    private static Node buildElementForText(TextNode textNode) {
        return new DataNode(wrapLongWords(textNode.getWholeText()), textNode.baseUri());
    }

    /**
     * Rewrites {@code text} token by token. Whitespace runs are copied
     * through unchanged, so spacing is preserved exactly, and each word is
     * handled once at its own position. This avoids regex-based replacement,
     * which fails on words containing characters such as {@code (} or
     * {@code $} and re-wraps words that occur inside other words.
     *
     * @param text the decoded content of a text node
     * @return markup with long words wrapped in spans
     */
    private static String wrapLongWords(String text) {
        StringBuilder out = new StringBuilder(text.length() + 16);
        int pos = 0;
        while (pos < text.length()) {
            int start = pos;
            boolean whitespace = Character.isWhitespace(text.charAt(pos));
            while (pos < text.length()
                    && Character.isWhitespace(text.charAt(pos)) == whitespace) {
                pos++;
            }
            String run = text.substring(start, pos);
            if (!whitespace && run.length() >= MIN_WORD_LENGTH) {
                out.append("<span>").append(escapeHtml(run)).append("</span>");
            } else {
                out.append(escapeHtml(run));
            }
        }
        return out.toString();
    }

    /**
     * Escapes the characters that would otherwise be read as markup. The
     * result is handed to a data node rather than a text node, so the
     * escaping is done here.
     * NOTE(review): assumes DataNode content is serialised verbatim - confirm
     * against the jsoup version in use.
     */
    private static String escapeHtml(String s) {
        // '&' must be handled first so the entities added below stay intact.
        return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
    }
}
答案 1 :(得分:4)
我认为你需要遍历这棵树。对元素调用 text() 得到的是该 Element 的全部文本,其中也包括子元素中的文本。希望以下代码对您有所帮助:
import java.io.File;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
/**
 * Reads {@code test.html}, walks the parsed body and prints a re-serialised
 * copy in which every word of at least {@link #MIN_WORD_LENGTH} characters
 * found in a text node is wrapped in a {@code <span>} element.
 */
public class ScreenScrape {

    /** Words with at least this many characters are wrapped in a span. */
    private static final int MIN_WORD_LENGTH = 4;

    /**
     * @param args unused
     * @throws IOException if {@code test.html} cannot be read
     */
    public static void main(String[] args) throws IOException {
        // Name the charset explicitly instead of relying on the platform default.
        String content = FileUtils.readFileToString(new File("test.html"), "UTF-8");
        Document doc = Jsoup.parse(content);
        Element body = doc.body();
        StringBuilder sb = new StringBuilder();
        traverse(body, sb);
        System.out.println(sb.toString());
    }

    /**
     * Appends the markup for {@code n} and all of its descendants to
     * {@code sb}, depth first.
     *
     * @param n  the node to serialise
     * @param sb the buffer receiving the output
     */
    private static void traverse(Node n, StringBuilder sb) {
        boolean isElement = n instanceof Element;
        if (isElement) {
            sb.append('<');
            sb.append(n.nodeName());
            if (n.attributes().size() > 0) {
                sb.append(n.attributes().toString());
            }
            sb.append('>');
        }
        if (n instanceof TextNode) {
            // Whitespace-only text is kept as well: dropping it would glue
            // together the words on either side of an element boundary.
            sb.append(spanifyText(((TextNode) n).text()));
        }
        for (Node c : n.childNodes()) {
            traverse(c, sb);
        }
        if (isElement) {
            // NOTE(review): every element receives a closing tag here,
            // including ones that normally have none (e.g. br).
            sb.append("</");
            sb.append(n.nodeName());
            sb.append('>');
        }
    }

    /**
     * Wraps each long word of {@code text} in {@code <span>...</span>}.
     * Whitespace runs, including leading and trailing ones, are copied
     * through unchanged, and an empty or blank input is returned as is.
     *
     * @param text the decoded content of a text node
     * @return markup with long words wrapped in spans
     */
    private static String spanifyText(String text) {
        StringBuilder sb = new StringBuilder(text.length() + 16);
        int pos = 0;
        while (pos < text.length()) {
            int start = pos;
            boolean whitespace = Character.isWhitespace(text.charAt(pos));
            while (pos < text.length()
                    && Character.isWhitespace(text.charAt(pos)) == whitespace) {
                pos++;
            }
            String run = text.substring(start, pos);
            if (!whitespace && run.length() >= MIN_WORD_LENGTH) {
                sb.append("<span>").append(escapeHtml(run)).append("</span>");
            } else {
                sb.append(escapeHtml(run));
            }
        }
        return sb.toString();
    }

    /**
     * Escapes the characters that would otherwise be read as markup, since
     * the text is appended directly to the output buffer.
     */
    private static String escapeHtml(String s) {
        // '&' must be handled first so the entities added below stay intact.
        return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
    }
}
**更新**
使用 Jonathan 在 Jsoup 中新增的 Element.textNodes() 方法(返回 List<TextNode>),并结合 MarcoS 建议的 NodeTraversor / NodeVisitor 技术,我得到了下面的代码(不过我是在遍历树的同时修改它——这可能不是个好主意):
Document doc = Jsoup.parse(content);
Element body = doc.body();
// Replaces every text node that contains a word of four or more characters
// with a sequence of nodes: a <span> element per long word, and plain text
// nodes for everything in between (short words and whitespace).
NodeTraversor nd = new NodeTraversor(new NodeVisitor() {
    @Override
    public void tail(Node node, int depth) {
        if (!(node instanceof Element)) {
            return;
        }
        Element elem = (Element) node;
        // NOTE(review): children of elem are replaced while this loop runs;
        // presumably textNodes() returns a snapshot list - confirm.
        for (TextNode tn : elem.textNodes()) {
            String text = tn.text();
            ArrayList<Node> replacement = new ArrayList<Node>();
            // Collects short words and whitespace until the next long word.
            StringBuilder plain = new StringBuilder();
            boolean foundLongWord = false;
            int pos = 0;
            while (pos < text.length()) {
                int start = pos;
                boolean whitespace = Character.isWhitespace(text.charAt(pos));
                while (pos < text.length()
                        && Character.isWhitespace(text.charAt(pos)) == whitespace) {
                    pos++;
                }
                String run = text.substring(start, pos);
                if (!whitespace && run.length() > 3) {
                    foundLongWord = true;
                    if (plain.length() > 0) {
                        replacement.add(new TextNode(plain.toString(), elem.baseUri()));
                        plain.setLength(0);
                    }
                    Element span = new Element(Tag.valueOf("span"), elem.baseUri());
                    span.appendText(run);
                    replacement.add(span);
                } else {
                    // Whitespace is carried over as found, so words on either
                    // side of a span are not glued to it.
                    plain.append(run);
                }
            }
            if (!foundLongWord) {
                continue;
            }
            if (plain.length() > 0) {
                replacement.add(new TextNode(plain.toString(), elem.baseUri()));
            }
            // Swap in the first new node, then chain the rest after it in order.
            Node currentNode = replacement.remove(0);
            tn.replaceWith(currentNode);
            for (Node n : replacement) {
                currentNode.after(n);
                currentNode = n;
            }
        }
    }

    @Override
    public void head(Node node, int depth) {
        // Nothing to do on the way down; all work happens in tail().
    }
});
nd.traverse(body);
System.out.println(body.toString());
答案 2 :(得分:0)
我把单词 hello 替换成了带 span 标签的 hello:
Document doc = Jsoup.parse(content);
Element test = doc.body();
Elements elements = test.getAllElements();
// Wraps every occurrence of "hello" in a blue span.
// NOTE: the new markup is built from text(), which also includes the text of
// child elements, so a matching element's child elements are replaced by
// their plain text.
for (int i = 0; i < elements.size(); i++) {
    String elementText = elements.get(i).text();
    if (elementText.contains("hello")) {
        // Read from the same list that is being iterated; "hello" is a
        // literal, so plain replace() is enough.
        elements.get(i).html(elementText.replace("hello", "<span style=\"color:blue\">hello</span>"));
    }
}