将html标记存储在xml中

时间:2011-04-25 10:05:54

标签: java html xml

我有一个包含各种 HTML 标签的 HTML 格式字符串。我想把这个字符串放进 XML 标签中,并让其中的 HTML 标签原样保留。例如:

/**
 * Static helpers that fetch a document from a Solr endpoint, sanitize its HTML body and
 * re-package it as a small {@code <results>} XML document in which each {@code <body>} element
 * carries the HTML inside a CDATA section, plus a few DOM convenience accessors.
 *
 * <p>NOTE(review): XML is parsed with a default-configured {@link DocumentBuilderFactory}. If the
 * input can ever come from an untrusted source, consider disabling DTDs / external entities.
 */
public class XMLfunctions {

    /** Line terminator used inside the generated XML. */
    private static final String CRLF = "\r\n";

    /** Stand-in response used when the HTTP request to the server fails. */
    private static final String CONNECT_ERROR_XML =
            "<results status=\"error\"><msg>Can't connect to server</msg></results>";

    /**
     * Parses an XML string into a DOM document.
     *
     * @param xml the XML text to parse; may be {@code null}
     * @return the parsed document, or {@code null} if {@code xml} is {@code null} or cannot be
     *     parsed (a message is printed to standard output in that case)
     */
    public static final Document XMLfromString(String xml) {
        if (xml == null) {
            // A null input used to escape as a NullPointerException; treat it like any other
            // unparsable input so callers only have to deal with the null return.
            System.out.println("XML parse error: input is null");
            return null;
        }
        try {
            return parse(xml);
        } catch (ParserConfigurationException e) {
            System.out.println("XML parse error: " + e.getMessage());
        } catch (SAXException e) {
            System.out.println("Wrong XML file structure: " + e.getMessage());
        } catch (IOException e) {
            System.out.println("I/O exeption: " + e.getMessage());
        }
        return null;
    }

    /**
     * Returns the value of the first text or CDATA child of the given node.
     *
     * <p>CDATA sections are accepted as well as plain text nodes: a non-coalescing parser reports
     * {@code <body><![CDATA[...]]></body>} as a CDATA child, which would otherwise be skipped and
     * make the element look empty.
     *
     * @param elem the element (XML tag) to read; may be {@code null}
     * @return the value of the first text/CDATA child, otherwise an empty string
     */
    public static final String getElementValue(Node elem) {
        if (elem == null) {
            return "";
        }
        for (Node kid = elem.getFirstChild(); kid != null; kid = kid.getNextSibling()) {
            short type = kid.getNodeType();
            if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
                return kid.getNodeValue();
            }
        }
        return "";
    }

    /**
     * Queries the Solr server for the document with the given content id and converts the
     * response into the {@code <results>} XML produced by {@link #ParseXMLBodyNode}.
     *
     * @param id the content id to look up; it is URL-encoded before being placed in the query
     * @return the generated {@code <results>} XML; when the request fails, the error document is
     *     converted instead, which yields a result set without entries
     */
    public static String getBodyXML(String id) {
        String line;
        DefaultHttpClient httpClient = new DefaultHttpClient();
        try {
            // Encode the id so spaces or reserved characters cannot corrupt the query string
            // (an illegal URI would otherwise surface as an unchecked exception).
            String encodedId = java.net.URLEncoder.encode(String.valueOf(id), "UTF-8");
            HttpPost httpPost = new HttpPost(
                    "http://192.168.1.44:9090/solr/core0/select/?q=content_id:" + encodedId
                            + "&version=2.2&start=0&rows=10&indent=on");
            HttpResponse httpResponse = httpClient.execute(httpPost);
            HttpEntity httpEntity = httpResponse.getEntity();
            line = EntityUtils.toString(httpEntity);
        } catch (IOException e) {
            // Covers encoding, URL and transport failures alike; all map to the same message.
            line = CONNECT_ERROR_XML;
        } finally {
            // Release the connections held by this one-shot client.
            httpClient.getConnectionManager().shutdown();
        }
        return ParseXMLBodyNode(line, "doc");
    }

    /**
     * Builds the {@code <results>} XML from a server response.
     *
     * <p>For every element named {@code node}, the content of its first {@code <str>} descendant
     * is sanitized with {@link #html2text} and emitted as
     * {@code <result><body><![CDATA[<body>HTML</body>]]></body></result>}. Elements without a
     * {@code <str>} descendant are skipped. The {@code count} attribute of the root reflects the
     * number of {@code <result>} entries actually written.
     *
     * @param str the XML text returned by the server
     * @param node the tag name of the elements to convert (e.g. {@code "doc"})
     * @return a well-formed {@code <results>} document; if {@code str} cannot be processed the
     *     failure is printed and the entries gathered so far (possibly none) are returned
     */
    public static String ParseXMLBodyNode(String str, String node) {
        StringBuilder items = new StringBuilder();
        int count = 0;
        try {
            NodeList matches = parse(str).getElementsByTagName(node);
            for (int i = 0; i < matches.getLength(); i++) {
                Node match = matches.item(i);
                if (match.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Node firstStr = ((Element) match).getElementsByTagName("str").item(0);
                if (firstStr == null) {
                    continue; // nothing to extract from this entry
                }
                // The text is collected per entry so one body never leaks into the next.
                String html = html2text(collectText(firstStr));
                items.append("<result>").append(CRLF);
                items.append("<body>");
                items.append("<![CDATA[<body>");
                // "]]>" would terminate the CDATA section early; split it across two sections.
                items.append(html.replace("]]>", "]]]]><![CDATA[>"));
                items.append("</body>]]>");
                items.append("</body>").append(CRLF);
                items.append("</result>").append(CRLF);
                count++;
            }
        } catch (Exception e) {
            System.out.println("Exception........");
            e.printStackTrace();
        }
        // Assemble the envelope last so it is always closed and the count is accurate.
        StringBuilder xml = new StringBuilder();
        xml.append("<results count=\"").append(count).append("\">").append(CRLF);
        xml.append(items);
        xml.append("</results>");
        return xml.toString();
    }

    /**
     * Reads the {@code count} attribute of the document's root element.
     *
     * @param doc the parsed {@code <results>} document; may be {@code null}
     * @return the numeric value of {@code count}, or {@code -1} if the document, its root element
     *     or a valid integer attribute is missing
     */
    public static int numResults(Document doc) {
        if (doc == null) {
            return -1;
        }
        Element root = doc.getDocumentElement();
        if (root == null) {
            return -1;
        }
        try {
            // getAttribute returns "" for a missing attribute, which parseInt rejects.
            return Integer.parseInt(root.getAttribute("count"));
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Returns the value of the first descendant of {@code item} with the given tag name.
     *
     * @param item the element to search in
     * @param str the tag name to look for
     * @return the value as defined by {@link #getElementValue}, or an empty string if no such
     *     descendant exists
     */
    public static String getValue(Element item, String str) {
        NodeList matches = item.getElementsByTagName(str);
        return getElementValue(matches.item(0));
    }

    /**
     * Sanitizes HTML with Jsoup's basic whitelist. Despite the name, the result is still HTML:
     * markup permitted by {@code Whitelist.basic()} is kept.
     *
     * @param html the HTML to clean
     * @return the cleaned HTML
     */
    public static String html2text(String html) {
        return Jsoup.clean(html, Whitelist.basic());
    }

    /** Parses XML text into a DOM document using a default-configured builder. */
    private static Document parse(String xml)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        InputSource source = new InputSource();
        source.setCharacterStream(new StringReader(xml));
        return builder.parse(source);
    }

    /** Concatenates the values of the direct children of a node, ignoring valueless children. */
    private static String collectText(Node parent) {
        StringBuilder text = new StringBuilder();
        NodeList children = parent.getChildNodes();
        for (int j = 0; j < children.getLength(); j++) {
            String value = children.item(j).getNodeValue();
            if (value != null) { // element children have no value; avoid appending "null"
                text.append(value);
            }
        }
        return text.toString();
    }

}

我这样调用这些函数:

// Fetch the document for the given id and get back the generated <results> XML string.
String xml = XMLfunctions.getBodyXML(id);

// Parse that string into a DOM Document (XMLfromString returns null if parsing fails).
Document doc = XMLfunctions.XMLfromString(xml);

我希望 font 标签能作为 HTML 标签原样保留在 XML 中。

帮助将不胜感激!!!!!

1 个答案:

答案 0 :(得分:12)

将您的HTML封装在CDATA section中,这样就不会将其视为XML的一部分,而只是普通文本:

<result>
<!-- Everything inside the CDATA section is plain character data, not XML markup. -->
<![CDATA[
    <body><font size="2px" face="arial">Hello World</font></body>
]]>
</result>

**更新**

你的问题可能就在这里:

// Excerpt from ParseXMLBodyNode: appends one <result> entry to the output buffer.
sb.append("<result>\r\n");
    sb.append("<body>"); // opens an outer <body> element (real XML markup)
    String tmpText = html2text(text.toString());
        sb.append("<![CDATA[<body>"); // this "<body>" is literal text inside the CDATA section
        sb.append(tmpText);
        sb.append("</body>]]>");
    sb.append("</body>\r\n"); // closes the outer <body> element
sb.append("</result>\r\n");

请注意 CDATA 部分外层的 sb.append("<body>"); 和 sb.append("</body>\r\n"); 这两行,它们可能导致 XML 无法被正确读取。也许你应该删除这两行,使代码变成这样:

// Suggested version: the CDATA section sits directly inside <result>, with no outer <body> element.
sb.append("<result>\r\n");
    String tmpText = html2text(text.toString());
    sb.append("<![CDATA[<body>"); // "<body>" here is literal text inside the CDATA section
    sb.append(tmpText);
    sb.append("</body>]]>");
sb.append("</result>\r\n");