解析嵌套在其他XML值中的XML标记

时间:2012-09-05 06:13:56

标签: java xml xpath xml-parsing jaxp

我很难开发一个特定的XML解析器来解析大量的XML。

我的问题是如何解析嵌套在其他XML值中的XML标记。 我的输入文件看起来像这样。

<main>
<step>
    <para>Calculate the values from the pool</para>
</step>
<step>
        <para>Use these(<internalRef id ="003" xlink:actuate="onRequest" xlink:show="replace" xlink:href="max003"/>) values finally</para>
</step>
</main>

我可以使用 XPath 获取第一个 step 标记的值。我的问题是:如何使用 XPath 获取第二个 step 的值,或者说,如何识别在元素的文本值内部何时开始了一个新的标记。

例如,我针对第二个 step 的 XPath 返回的结果是:Use these() values finally

我的目标是得到:Use these(max003) values finally

max003 值必须取自 xlink:href

补充:我可以通过分别编写 XPath 来获取 id、actuate、show 各个属性的值。我的问题是,在取得 xlink:href 的值 max003 之后,需要把它填入括号内(位于 "these" 之后、"values" 之前),然后通过网络发送出去用于显示。所以我在寻找一种方法,来确定子节点是在何处、何时开始的,以及一种把该值填入括号内的机制。

3 个答案:

答案 0 :(得分:2)

单独使用XPath,您将无法做到这一点。你在那里有混合内容XML,这意味着一个元素可能包含文本值和子元素。您只能使用XPath一次引用其中一个,并且您也不能只从多个XPath表达式中获取内容,因为文本值可能围绕您在示例中声明的子元素。

我建议您使用XSLT转换文档,然后像现在一样使用XPath查询转换后的文档。另一种方法是编写自己的解析器,它能够正确处理嵌套元素。

这个XSLT可能适合你(没有经过彻底测试):

<?xml version="1.0" encoding="utf-8"?>
<!--
  Copies the input document as-is, except that every internalRef element is
  replaced by the string value of its xlink:href attribute.
  The xlink prefix is bound here to http://www.w3.org/1999/xlink; the input
  document must bind its xlink prefix to the same namespace URI for
  @xlink:href to match.
-->
<xsl:stylesheet 
  version="1.0" 
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  xmlns:xlink="http://www.w3.org/1999/xlink">
    <xsl:output method="xml" indent="yes"/>

    <!-- Identity template: copy every attribute and node, recursing into children. -->
    <xsl:template match="@* | node()">
        <xsl:copy>
            <xsl:apply-templates select="@* | node()"/>
        </xsl:copy>
    </xsl:template>

    <!-- Override for internalRef: emit only the xlink:href value instead of copying the element. -->
    <xsl:template match="internalRef">
        <xsl:value-of select="@xlink:href"/>
    </xsl:template>
</xsl:stylesheet>

当然,您需要使用XSLT处理器来转换原始文档。

解析器看起来像这样(注意这只是StAX解析器的骨架代码):

import java.io.StringReader;
import java.util.Iterator;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

/**
 * Skeleton StAX parser for documents of the shape
 * {@code <main><step><para>text <internalRef href="..."/> text</para></step></main>}.
 *
 * <p>The text of each {@code para} is printed to standard output with every nested
 * {@code internalRef} element replaced by the value of its {@code href} attribute
 * (matched by local name, so {@code xlink:href} is accepted too).
 *
 * <p>Instances keep parsing state in fields and are therefore not safe for concurrent use;
 * the state is reset at the start of every {@link #parse(String)} call, so an instance can
 * be reused sequentially, even after a failed parse.
 */
public class ExampleStAXParser {

    /** Position of the parser inside the expected main/step/para nesting. */
    private enum State { UNDEFINED, MAIN, STEP, PARA }

    private static final String EL_MAIN = "main";
    private static final String EL_STEP = "step";
    private static final String EL_PARA = "para";
    private static final String EL_INTERNAL_REF = "internalRef";
    private static final String ATT_HREF = "href";

    private State state = State.UNDEFINED;

    /** Mixed content of the {@code para} currently being read. */
    private final StringBuilder characters = new StringBuilder();

    /**
     * Parses the given XML text and prints the elements and paragraph values found.
     *
     * @param xmlString the XML document; must be neither {@code null} nor empty
     * @throws IllegalArgumentException if {@code xmlString} is {@code null} or empty
     * @throws XMLStreamException if the text is not well-formed XML or does not follow the
     *         expected main/step/para/internalRef structure
     * @throws Exception retained in the signature for source compatibility with existing callers
     */
    public void parse(String xmlString) throws XMLStreamException, Exception {
        if (xmlString == null || xmlString.isEmpty()) {
            throw new IllegalArgumentException("Illegal initializiation (xmlString is null or empty)");
        }

        // Start from a clean slate so that a previous, failed parse cannot poison this one.
        state = State.UNDEFINED;
        characters.setLength(0);

        XMLInputFactory inputFact = XMLInputFactory.newInstance();
        // Harden against XXE: this parser never needs DTDs or external entities.
        inputFact.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        inputFact.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

        XMLEventReader reader = null;
        try {
            reader = inputFact.createXMLEventReader(new StringReader(xmlString));
            while (reader.hasNext()) {
                XMLEvent event = reader.nextEvent();
                if (event.isCharacters()) {
                    characters(event);
                } else if (event.isStartElement()) {
                    startElement(event);
                    StartElement start = event.asStartElement();
                    String owner = start.getName().getLocalPart();
                    // Iterator<?> compiles without unchecked warnings on old and new JDKs alike.
                    Iterator<?> attributes = start.getAttributes();
                    while (attributes.hasNext()) {
                        attribute((Attribute) attributes.next(), owner);
                    }
                } else if (event.isEndElement()) {
                    endElement(event);
                } else if (event.isStartDocument()) {
                    startDocument(event);
                } else if (event.isEndDocument()) {
                    endDocument(event);
                }
            }
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (XMLStreamException ignored) {
                    // Best effort: a failure while closing must not mask the outcome of the parse.
                }
            }
        }
    }

    /**
     * Handles one attribute of the element that was just opened.
     *
     * @param attr the attribute
     * @param ownerElement local name of the element carrying the attribute
     * @throws XMLStreamException if an attribute appears outside of a {@code para}
     */
    private void attribute(Attribute attr, String ownerElement) throws XMLStreamException {
        String name = attr.getName().getLocalPart();
        if (state != State.PARA) {
            throw unexpected("unexpected attribute: " + name, attr);
        }
        // Only the href of an internalRef contributes to the text; an href on para itself does not.
        if (EL_INTERNAL_REF.equals(ownerElement) && ATT_HREF.equals(name)) {
            characters.append(attr.getValue());
        }
    }

    /**
     * Collects character data that belongs to the current {@code para}.
     *
     * @throws XMLStreamException if non-whitespace text appears outside of a {@code para}
     */
    private void characters(XMLEvent event) throws XMLStreamException {
        Characters text = event.asCharacters();
        if (state == State.PARA) {
            // Keep whitespace-only runs too: inside mixed content they are significant
            // (e.g. the blank between two adjacent internalRef elements).
            characters.append(text.getData());
        } else if (!text.isWhiteSpace()) {
            throw unexpected("unexpected character data", event);
        }
        // Whitespace between structural elements is just indentation and is ignored.
    }

    /** Validates an opening tag against the current state and advances the state. */
    private void startElement(XMLEvent event) throws XMLStreamException {
        String name = event.asStartElement().getName().getLocalPart();
        switch (state) {
            case UNDEFINED:
                requireName(EL_MAIN, name, event);
                state = State.MAIN;
                break;
            case MAIN:
                requireName(EL_STEP, name, event);
                state = State.STEP;
                break;
            case STEP:
                requireName(EL_PARA, name, event);
                state = State.PARA;
                break;
            case PARA:
                // internalRef is inline content of para, so the state does not change.
                requireName(EL_INTERNAL_REF, name, event);
                break;
            default:
                throw unexpected("unexpected element: " + name, event);
        }
        System.out.println("Element: " + name);
    }

    /** Validates a closing tag against the current state and steps the state back. */
    private void endElement(XMLEvent event) throws XMLStreamException {
        String name = event.asEndElement().getName().getLocalPart();
        switch (state) {
            case MAIN:
                requireName(EL_MAIN, name, event);
                state = State.UNDEFINED;
                break;
            case STEP:
                requireName(EL_STEP, name, event);
                state = State.MAIN;
                break;
            case PARA:
                if (EL_INTERNAL_REF.equals(name)) {
                    break; // end of an inline reference; still inside para
                }
                requireName(EL_PARA, name, event);
                System.out.println("Value: " + characters);
                characters.setLength(0);
                state = State.STEP;
                break;
            default:
                throw unexpected("unexpected element: " + name, event);
        }
    }

    private void startDocument(XMLEvent event) {
        System.out.println("Parsing started");
    }

    private void endDocument(XMLEvent event) {
        System.out.println("Parsing ended");
    }

    /** Throws if {@code actual} is not the element name the current state requires. */
    private static void requireName(String expected, String actual, XMLEvent event)
            throws XMLStreamException {
        if (!expected.equals(actual)) {
            throw unexpected("unexpected element: " + actual, event);
        }
    }

    /** Builds a structure-violation exception, including the event location when available. */
    private static XMLStreamException unexpected(String message, XMLEvent event) {
        return event.getLocation() == null
                ? new XMLStreamException(message)
                : new XMLStreamException(message, event.getLocation());
    }

    /** Runs the parser on the sample document from the question. */
    public static void main(String[] argv) throws XMLStreamException, Exception {
        String xml = "<main>"
                + "<step>"
                + "    <para>Calculate the values from the pool</para>"
                + "</step>"
                + "<step>"
                + "        <para>Use these(<internalRef id =\"003\" actuate=\"onRequest\" show=\"replace\" href=\"max003\"/>) values finally</para>"
                + "</step>"
                + "</main>";

        ExampleStAXParser parser = new ExampleStAXParser();
        parser.parse(xml);
    }
}

答案 1 :(得分:2)

评估此Xpath表达式:

 concat(/*/step[2]/para/text()[1],
        /*/step[2]/para/internalRef/@xlink:href,
        /*/step[2]/para/text()[2])

在提供的XML文档上(更正为名称空间格式良好):

<main xmlns:xlink="Undefined namespace">
    <step>
        <para>Calculate the values from the pool</para>
    </step>
    <step>
        <para>Use these(<internalRef id ="003" xlink:actuate="onRequest" xlink:show="replace" xlink:href="max003"/>) values finally</para>
    </step>
</main>

产生想要的结果

Use these(max003) values finally

请注意:您需要使用XPath API“注册xlink命名空间”,以便在不出错的情况下评估此XPath表达式。

基于XSLT的验证

<!--
  Verification stylesheet: evaluates the concat() XPath expression against the
  source document and writes the resulting string as plain text.
  The xlink prefix is bound to the literal URI "Undefined namespace", so the
  source document has to bind its xlink prefix to that same URI.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
 xmlns:xlink="Undefined namespace">
 <xsl:output method="text"/>
 <!-- Drop whitespace-only text nodes from the source tree. -->
 <xsl:strip-space elements="*"/>

 <!-- Joins the first text node of the second step's para, the internalRef's xlink:href value, and the second text node. -->
 <xsl:template match="/">
     <xsl:copy-of select=
     "concat(/*/step[2]/para/text()[1],
           /*/step[2]/para/internalRef/@xlink:href,
           /*/step[2]/para/text()[2])
     "/>
 </xsl:template>
</xsl:stylesheet>

在提供的XML文档(上面)上应用此转换时,将评估Xpath表达式并将此评估的结果复制到输出

Use these(max003) values finally

答案 2 :(得分:1)

据我所知,你的解析器看到的结构大概是这样的:

step
 +- para
     +-id

然后将“文本”内容包装在一起,提取出id节点......

(这纯属推测)

**更新**

如果我只是遍历节点树(列出每个子节点),得到的结果如下:

 main
  step
    para
      #text - Calculate the values from the pool
  step
    para
      #text - Use these(
      id
      #text - ) values finally

这意味着“id”是“para”的子节点。