Java DOM在xml节点中解析html

时间:2012-09-27 13:40:50

标签: java xml

我在这里有一个解析器:

package lt.prasom.functions;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.util.Properties;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.w3c.dom.CharacterData;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import android.annotation.TargetApi;
import android.media.MediaRecorder.OutputFormat;
import android.util.Log;

/**
 * Small helper for downloading an XML document over HTTP, parsing it into a
 * DOM tree and reading element values out of that tree.
 *
 * <p>Element values can be read either as plain text (the first text node of
 * the element) or as markup (all child nodes serialized back to XML), which is
 * what is needed when an element such as
 * {@code <description>yes <b>no</b></description>} embeds HTML.
 */
public class XMLParser {

    /** Log tag used for every error reported by this class. */
    private static final String LOG_TAG = "Error: ";

    /**
     * Charset used to decode the HTTP body when the response does not declare
     * one. UTF-8 is the default encoding of XML itself.
     */
    private static final String DEFAULT_CHARSET = "UTF-8";

    // constructor
    public XMLParser() {

    }

    /**
     * Downloads the body of {@code url} with an HTTP GET request.
     *
     * @param url address of the XML document
     * @return the response body as a string, or {@code null} when the request
     *         fails or the response carries no body
     */
    public String getXmlFromUrl(String url) {
        String xml = null;
        DefaultHttpClient httpClient = new DefaultHttpClient();

        try {
            HttpResponse httpResponse = httpClient.execute(new HttpGet(url));
            HttpEntity httpEntity = httpResponse.getEntity();
            // A response without a body has no entity; EntityUtils rejects null.
            if (httpEntity != null) {
                xml = EntityUtils.toString(httpEntity, DEFAULT_CHARSET);
            }
        } catch (IOException e) {
            // Also covers UnsupportedEncodingException and
            // ClientProtocolException, which are IOException subclasses.
            logError(e);
        } finally {
            // Release the connections held by this one-shot client.
            httpClient.getConnectionManager().shutdown();
        }
        // return XML
        return xml;
    }

    /**
     * Parses an XML string into a DOM document.
     *
     * @param xml XML text to parse; may be {@code null}
     * @return the parsed document, or {@code null} when {@code xml} is
     *         {@code null} or cannot be parsed
     */
    public Document getDomElement(String xml) {
        // getXmlFromUrl() returns null on failure; StringReader would throw on it.
        if (xml == null) {
            return null;
        }

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);

        try {
            DocumentBuilder db = dbf.newDocumentBuilder();

            InputSource is = new InputSource();
            is.setCharacterStream(new StringReader(xml));
            return db.parse(is);
        } catch (ParserConfigurationException e) {
            logError(e);
            return null;
        } catch (SAXException e) {
            logError(e);
            return null;
        } catch (IOException e) {
            logError(e);
            return null;
        }
    }

    /**
     * Returns the value of a node.
     *
     * @param elem node to read; may be {@code null}
     * @param html when {@code false}, the value of the first direct text child
     *             is returned; when {@code true}, all child nodes (text and
     *             nested elements alike) are serialized back to markup, so
     *             {@code <description>yes <b>no</b></description>} yields
     *             {@code yes <b>no</b>}
     * @return the node value, or an empty string when {@code elem} is
     *         {@code null}, has no children, has no text child (plain mode) or
     *         cannot be serialized (markup mode)
     */
    @TargetApi(8)
    public final String getElementValue(Node elem, boolean html) {
        if (elem == null || !elem.hasChildNodes()) {
            return "";
        }
        if (html) {
            return serializeChildren(elem);
        }
        for (Node child = elem.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child.getNodeType() == Node.TEXT_NODE) {
                return child.getNodeValue();
            }
        }
        return "";
    }

    /**
     * Returns the plain-text value of the first descendant element of
     * {@code item} whose tag name is {@code str}.
     *
     * @param item element to search in
     * @param str  tag name to look for
     * @return the text value, or an empty string when no such element exists
     */
    public String getValue(Element item, String str) {
        NodeList n = item.getElementsByTagName(str);

        return this.getElementValue(n.item(0), false);
    }

    /** Serializes every child of {@code parent} to markup, without an XML declaration. */
    @TargetApi(8)
    private String serializeChildren(Node parent) {
        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            // Each child is serialized on its own; a declaration per child would corrupt the output.
            transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");

            StringWriter markup = new StringWriter();
            for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
                transformer.transform(new DOMSource(child), new StreamResult(markup));
            }
            return markup.toString();
        } catch (javax.xml.transform.TransformerException e) {
            logError(e);
            return "";
        }
    }

    /** Logs {@code e}; guards against a null message, which Log.e does not accept. */
    private static void logError(Exception e) {
        Log.e(LOG_TAG, String.valueOf(e.getMessage()), e);
    }

}

还有我的样本xml:

<items>
<item>
<name>test</name>
<description>yes <b>no</b></description>
</item>
</items>

当我解析 description 时,只能得到第一个标签之前的内容(“yes”)。我想取得 description 标签内的全部原始内容。我试过 CDATA 标签,但没有用。有没有办法在不对 xml 进行转义编码的情况下做到这一点?

谢谢!

1 个答案:

答案 0 :(得分:1)

我同意评论中的看法:这个问题不够完整,也不够具体,无法直接回答(例如直接修改你的源码使其工作)。不过我曾经做过有些类似的事情(我认为是类似的),所以补充如下,也许会有帮助。

因此,如果(注意,仅仅是“如果”)“description”元素的内容本身就是格式良好的 XML,比如文档实际上是这样的:

<items>
  <item>
   <name>test</name>
   <description><span>yes <b>no</b></span></description>
  </item>
</items>

那么您可以把“description”元素的内容提取出来,作为一个新的 XML 文档,然后得到如下形式的 XML 文本:

<span>yes <b>no</b></span>

实现这一点的方法大致如下:

/**
 * Returns the content of the {@code /items/item/description} element as a new,
 * independent XML document.
 *
 * <p>The first element found directly inside the description (for example the
 * {@code <span>} in {@code <description><span>yes <b>no</b></span></description>})
 * becomes the root of the returned document. The source document is left
 * unmodified: the content is copied, not moved.
 *
 * @param sourceDocument document containing the description element
 * @return a new document rooted at the description's first child element; an
 *         empty document when the description contains no element; or
 *         {@code null} when no description element is found or an error occurs
 */
public Document retrieveDescriptionAsDocument(Document sourceDocument) {

    final String descriptionXPath = "/items/item/description";
    Document document2 = null;

    try {
        // get the description node, I am just using XPath here as it is easy
        // to read, you already have a reference to the node so just continue as you
        // were doing for that, bottom line is to get a reference to the node
        Node tmpNode = org.apache.xpath.XPathAPI.selectSingleNode(sourceDocument, descriptionXPath);

        if (tmpNode != null) {
            // create a new empty document
            document2 = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();

            // A document may have only one root element, so take the first
            // element inside the description and skip surrounding whitespace text.
            for (Node child = tmpNode.getFirstChild(); child != null; child = child.getNextSibling()) {
                if (child.getNodeType() == Node.ELEMENT_NODE) {
                    // importNode makes a deep copy owned by the new document,
                    // leaving the original node attached to its source tree.
                    document2.appendChild(document2.importNode(child, true));
                    break;
                }
            }
        }
        else {
            // LOG WARNING
            yourLoggerOrWhatever.warn("retrieveContainedDocument: No data found for XPath:" + descriptionXPath);
        }

    } catch (Exception e) {
        // LOG ERROR
        yourLoggerOrWhatever.error("Exception caught getting contained document:",e);
    }

    // return the new doc, and the caller can then output that new document, that will now just contain "<span>yes <b>no</b></span>" as text, apply an XSL or whatever
    return document2;
}