我有以下xml文档:
<?xml version="1.0" ?>
<tag>text<b><b>bold</b> bold again</b><b><br/>the end </tag>
我需要删除重复的标签但保留其内容,结果是:
<?xml version="1.0" ?>
<tag>text<b>bold bold again</b>the end </tag>
我有以下代码:
import java.io.*;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.w3c.dom.Document;
import org.w3c.dom.*;
import java.util.Arrays;
import javax.xml.transform.*;
import javax.xml.transform.dom.*;
import javax.xml.transform.stream.*;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
public class TakeDuplicatesXml{
public static void main(String[] args){
try{
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
Document doc = docBuilder.parse("/Users/youruser/code/Exercises/file.xml");
//get node list
List<String> aux = new ArrayList<String>();
removeDuplicate(doc.getDocumentElement(), aux);
//print the new document out
printXmlDocument(doc);
} catch (Exception ex) {
ex.printStackTrace();
}
}
public static void printXmlDocument(Document doc){
try{
DOMSource domSource = new DOMSource(doc);
StringWriter writer = new StringWriter();
StreamResult result = new StreamResult(writer);
TransformerFactory tf = TransformerFactory.newInstance();
Transformer transformer = tf.newTransformer();
transformer.transform(domSource, result);
System.out.println("XML IN String format is: \n" + writer.toString());
}catch (Exception ex) {
ex.printStackTrace();
}
}
//with recursion
public static void removeDuplicate(Node node, List<String> aux){
System.out.println(node.getNodeName());
//check if that node exists already
if(aux.contains(node.getNodeName())){
node.getParentNode().removeChild(node);
}else{
//add node name to aux list
aux.add(node.getNodeName());
}
NodeList nodeList = node.getChildNodes();
for (int i = 0; i < nodeList.getLength(); i++) {
Node currentNode = nodeList.item(i);
if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
//calls this method for all the children which is Element
removeDuplicate(currentNode, aux);
}
}
}
}
但结果不是我想要的,因为它带有节点的内容(粗体字已经消失):
<tag>text<b> bold again</b><br/>the end </tag>
我该如何解决?我怎样才能让它更高效?
答案 0 :(得分:0)
<b>bold</b>
是<b> bold again</b>.
的孩子如果你执行node.getParentNode().removeChild(node)
它已经消失了。在删除之前,将子项的值添加到父项的值。
查询XML的另一种可能性是XPath。见XPath Tutorial:
无法判断这是否更具效果。但我想这是 - 支持自写的递归。
答案 1 :(得分:0)
所以我找到了解决方案。我仍然不确定它是否是最佳解决方案,但运作良好并按正确顺序排列内容:
//with recursion
public static void removeDuplicate(Node node, List<String> aux){
//check if that node exists already
if(aux.contains(node.getNodeName())){
Node parentNode = node.getParentNode();
String value = parentNode.getTextContent();
parentNode.removeChild(node);
parentNode.setTextContent(value);
}else{
//add node name to aux list
aux.add(node.getNodeName());
}
NodeList nodeList = node.getChildNodes();
for (int i = 0; i < nodeList.getLength(); i++) {
Node currentNode = nodeList.item(i);
if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
//calls this method for all the children which is Element
removeDuplicate(currentNode, aux);
}
}
答案 2 :(得分:0)
使用简单的XSLT转换可以最好地解决这类问题。您需要一个包含两个规则的样式表:一个复制所有未更改的标识规则
<xsl:template match="*">
<xsl:copy>
<xsl:copy-of select="@*"/>
<xsl:apply-templates select="child::node()"/>
</xsl:copy>
</xsl:template>
和删除嵌套b标记的另一个(更高优先级)规则:
<xsl:template match="b/b">
<xsl:apply-templates/>
</xsl:template>
将它们包裹在通常的样板中:
<xsl:stylesheet version="1.0" xmlns:xsl="http:www.w3.org/1999/XSL/Transform">
.. template rules go here ...
</xsl:stylesheet>
然后使用以下命令从Java程序中调用它:
public class TakeDuplicatesXml {
public static void main(String[] args){
try{
TransformerFactory tFactory = TransformerFactory.newInstance();
Templates t = tFactory.newTemplates(new File(... stylesheet file ....));
Source doc = new StreamSource(
new File("/Users/youruser/code/Exercises/file.xml"));
t.newTransformer().transform(doc, new StreamResult(System.out));
} catch (Exception ex) {
ex.printStackTrace();
}
}