Java - 将命名的html实体转换为编号的xml实体

时间:2012-05-03 20:15:38

标签: java html xml parsing entities

我希望将包含html命名实体的html块转换为符合xml标准的块,该块使用编号的xml实体,同时保留所有html标记元素。

这是通过测试说明的基本思想:

/**
 * A named HTML entity inside an element must be rewritten as the matching
 * numeric entity while the surrounding markup stays untouched.
 */
@Test
public void testEvalHtmlEntitiesToXmlEntities() {
    final String html = "<a href=\"test.html\">link&nbsp;</a>";
    final String converted = SomeUtil.eval(html);
    Assert.assertEquals("<a href=\"test.html\">link&#160;</a>", converted);
}

是否有人知道提供此功能的类?我可以写一个正则表达式迭代非元素匹配并执行:

// For one non-markup text run: unescape the HTML entities, then escape the result for XML and append it.
xlmString += StringEscapeUtils.escapeXml(StringEscapeUtils.unescapeHtml(htmlString));

但希望有一种更简单的方法,或者已经有提供此功能的类。

5 个答案:

答案 0 :(得分:3)

您是否尝试过使用JTidy?

/**
 * Feeds the given markup through JTidy configured for XML output and returns
 * what JTidy wrote, decoded as UTF-8.
 *
 * @param data markup to process; encoded as UTF-8 before being handed to JTidy
 * @return JTidy's output (body content only) as a string
 * @throws UnsupportedEncodingException if the UTF-8 charset name is rejected
 */
private String cleanData(String data) throws UnsupportedEncodingException {
    final String charsetName = "UTF-8";

    Tidy tidy = new Tidy();
    tidy.setInputEncoding(charsetName);
    tidy.setOutputEncoding(charsetName);
    tidy.setPrintBodyOnly(true); // emit only the body content, no html/head wrapper
    tidy.setXmlOut(true);        // serialize as XML
    tidy.setSmartIndent(true);

    byte[] rawInput = data.getBytes(charsetName);
    ByteArrayInputStream source = new ByteArrayInputStream(rawInput);
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    tidy.parseDOM(source, sink);

    return sink.toString(charsetName);
}

不过我认为,如果HTML代码本身有问题,它也会顺带修复其中的一部分。

答案 1 :(得分:3)

这可能对您有用。

private static Map<String, String> entityMap = new HashMap<String, String>();
static{
    entityMap.put("nbsp", "&#160;");
    entityMap.put("iexcl", "&#161;");
    entityMap.put("cent", "&#162;");
    entityMap.put("pound", "&#163;");
    entityMap.put("curren", "&#164;");
    entityMap.put("yen", "&#165;");
    entityMap.put("brvbar", "&#166;");
    entityMap.put("sect", "&#167;");
    entityMap.put("uml", "&#168;");
    entityMap.put("copy", "&#169;");
    entityMap.put("ordf", "&#170;");
    entityMap.put("laquo", "&#171;");
    entityMap.put("not", "&#172;");
    entityMap.put("shy", "&#173;");
    entityMap.put("reg", "&#174;");
    entityMap.put("macr", "&#175;");
    entityMap.put("deg", "&#176;");
    entityMap.put("plusmn", "&#177;");
    entityMap.put("sup2", "&#178;");
    entityMap.put("sup3", "&#179;");
    entityMap.put("acute", "&#180;");
    entityMap.put("micro", "&#181;");
    entityMap.put("para", "&#182;");
    entityMap.put("middot", "&#183;");
    entityMap.put("cedil", "&#184;");
    entityMap.put("sup1", "&#185;");
    entityMap.put("ordm", "&#186;");
    entityMap.put("raquo", "&#187;");
    entityMap.put("frac14", "&#188;");
    entityMap.put("frac12", "&#189;");
    entityMap.put("frac34", "&#190;");
    entityMap.put("iquest", "&#191;");
    entityMap.put("Agrave", "&#192;");
    entityMap.put("Aacute", "&#193;");
    entityMap.put("Acirc", "&#194;");
    entityMap.put("Atilde", "&#195;");
    entityMap.put("Auml", "&#196;");
    entityMap.put("Aring", "&#197;");
    entityMap.put("AElig", "&#198;");
    entityMap.put("Ccedil", "&#199;");
    entityMap.put("Egrave", "&#200;");
    entityMap.put("Eacute", "&#201;");
    entityMap.put("Ecirc", "&#202;");
    entityMap.put("Euml", "&#203;");
    entityMap.put("Igrave", "&#204;");
    entityMap.put("Iacute", "&#205;");
    entityMap.put("Icirc", "&#206;");
    entityMap.put("Iuml", "&#207;");
    entityMap.put("ETH", "&#208;");
    entityMap.put("Ntilde", "&#209;");
    entityMap.put("Ograve", "&#210;");
    entityMap.put("Oacute", "&#211;");
    entityMap.put("Ocirc", "&#212;");
    entityMap.put("Otilde", "&#213;");
    entityMap.put("Ouml", "&#214;");
    entityMap.put("times", "&#215;");
    entityMap.put("Oslash", "&#216;");
    entityMap.put("Ugrave", "&#217;");
    entityMap.put("Uacute", "&#218;");
    entityMap.put("Ucirc", "&#219;");
    entityMap.put("Uuml", "&#220;");
    entityMap.put("Yacute", "&#221;");
    entityMap.put("THORN", "&#222;");
    entityMap.put("szlig", "&#223;");
    entityMap.put("agrave", "&#224;");
    entityMap.put("aacute", "&#225;");
    entityMap.put("acirc", "&#226;");
    entityMap.put("atilde", "&#227;");
    entityMap.put("auml", "&#228;");
    entityMap.put("aring", "&#229;");
    entityMap.put("aelig", "&#230;");
    entityMap.put("ccedil", "&#231;");
    entityMap.put("egrave", "&#232;");
    entityMap.put("eacute", "&#233;");
    entityMap.put("ecirc", "&#234;");
    entityMap.put("euml", "&#235;");
    entityMap.put("igrave", "&#236;");
    entityMap.put("iacute", "&#237;");
    entityMap.put("icirc", "&#238;");
    entityMap.put("iuml", "&#239;");
    entityMap.put("eth", "&#240;");
    entityMap.put("ntilde", "&#241;");
    entityMap.put("ograve", "&#242;");
    entityMap.put("oacute", "&#243;");
    entityMap.put("ocirc", "&#244;");
    entityMap.put("otilde", "&#245;");
    entityMap.put("ouml", "&#246;");
    entityMap.put("divide", "&#247;");
    entityMap.put("oslash", "&#248;");
    entityMap.put("ugrave", "&#249;");
    entityMap.put("uacute", "&#250;");
    entityMap.put("ucirc", "&#251;");
    entityMap.put("uuml", "&#252;");
    entityMap.put("yacute", "&#253;");
    entityMap.put("thorn", "&#254;");
    entityMap.put("yuml", "&#255;");
    entityMap.put("fnof", "&#192;");
    entityMap.put("Alpha", "&#913;");
    entityMap.put("Beta", "&#914;");
    entityMap.put("Gamma", "&#915;");
    entityMap.put("Delta", "&#916;");
    entityMap.put("Epsilon", "&#917;");
    entityMap.put("Zeta", "&#918;");
    entityMap.put("Eta", "&#919;");
    entityMap.put("Theta", "&#920;");
    entityMap.put("Iota", "&#921;");
    entityMap.put("Kappa", "&#922;");
    entityMap.put("Lambda", "&#923;");
    entityMap.put("Mu", "&#924;");
    entityMap.put("Nu", "&#925;");
    entityMap.put("Xi", "&#926;");
    entityMap.put("Omicron", "&#927;");
    entityMap.put("Pi", "&#928;");
    entityMap.put("Rho", "&#929;");
    entityMap.put("Sigma", "&#931;");
    entityMap.put("Tau", "&#932;");
    entityMap.put("Upsi", "&#933;");
    entityMap.put("Phi", "&#934;");
    entityMap.put("Chi", "&#935;");
    entityMap.put("Psi", "&#936;");
    entityMap.put("Omega", "&#937;");
    entityMap.put("alpha", "&#945;");
    entityMap.put("beta", "&#946;");
    entityMap.put("gamma", "&#947;");
    entityMap.put("delta", "&#948;");
    entityMap.put("epsi", "&#949;");
    entityMap.put("zeta", "&#950;");
    entityMap.put("eta", "&#951;");
    entityMap.put("theta", "&#952;");
    entityMap.put("iota", "&#953;");
    entityMap.put("kappa", "&#954;");
    entityMap.put("lambda", "&#955;");
    entityMap.put("mu", "&#956;");
    entityMap.put("nu", "&#957;");
    entityMap.put("xi", "&#958;");
    entityMap.put("omicron", "&#959;");
    entityMap.put("pi", "&#960;");
    entityMap.put("rho", "&#961;");
    entityMap.put("sigmaf", "&#962;");
    entityMap.put("sigma", "&#963;");
    entityMap.put("tau", "&#964;");
    entityMap.put("upsi", "&#965;");
    entityMap.put("phi", "&#966;");
    entityMap.put("chi", "&#967;");
    entityMap.put("psi", "&#968;");
    entityMap.put("omega", "&#969;");
    entityMap.put("theta", "&#977;");
    entityMap.put("upsih", "&#978;");
    entityMap.put("piv", "&#982;");
    entityMap.put("bull", "&#8226;");
    entityMap.put("hellip", "&#8230;");
    entityMap.put("prime", "&#8242;");
    entityMap.put("Prime", "&#8243;");
    entityMap.put("oline", "&#8254;");
    entityMap.put("frasl", "&#8260;");
    entityMap.put("weierp", "&#8472;");
    entityMap.put("image", "&#8465;");
    entityMap.put("real", "&#8476;");
    entityMap.put("trade", "&#8482;");
    entityMap.put("alefsym", "&#8501;");
    entityMap.put("larr", "&#8592;");
    entityMap.put("uarr", "&#8593;");
    entityMap.put("rarr", "&#8594;");
    entityMap.put("darr", "&#8595;");
    entityMap.put("harr", "&#8596;");
    entityMap.put("crarr", "&#8629;");
    entityMap.put("lArr", "&#8656;");
    entityMap.put("uArr", "&#8657;");
    entityMap.put("rArr", "&#8658;");
    entityMap.put("dArr", "&#8659;");
    entityMap.put("hArr", "&#8660;");
    entityMap.put("forall", "&#8704;");
    entityMap.put("part", "&#8706;");
    entityMap.put("exist", "&#8707;");
    entityMap.put("empty", "&#8709;");
    entityMap.put("nabla", "&#8711;");
    entityMap.put("isin", "&#8712;");
    entityMap.put("notin", "&#8713;");
    entityMap.put("ni", "&#8715;");
    entityMap.put("prod", "&#8719;");
    entityMap.put("sum", "&#8722;");
    entityMap.put("minus", "&#8722;");
    entityMap.put("lowast", "&#8727;");
    entityMap.put("radic", "&#8730;");
    entityMap.put("prop", "&#8733;");
    entityMap.put("infin", "&#8734;");
    entityMap.put("ang", "&#8736;");
    entityMap.put("and", "&#8869;");
    entityMap.put("or", "&#8870;");
    entityMap.put("cap", "&#8745;");
    entityMap.put("cup", "&#8746;");
    entityMap.put("int", "&#8747;");
    entityMap.put("there4", "&#8756;");
    entityMap.put("sim", "&#8764;");
    entityMap.put("cong", "&#8773;");
    entityMap.put("asymp", "&#8773;");
    entityMap.put("ne", "&#8800;");
    entityMap.put("equiv", "&#8801;");
    entityMap.put("le", "&#8804;");
    entityMap.put("ge", "&#8805;");
    entityMap.put("sub", "&#8834;");
    entityMap.put("sup", "&#8835;");

    entityMap.put("nsub", "&#8836;");
    entityMap.put("sube", "&#8838;");
    entityMap.put("supe", "&#8839;");
    entityMap.put("oplus", "&#8853;");
    entityMap.put("otimes", "&#8855;");
    entityMap.put("perp", "&#8869;");
    entityMap.put("sdot", "&#8901;");

    entityMap.put("lceil", "&#8968;");
    entityMap.put("rceil", "&#8969;");
    entityMap.put("lfloor", "&#8970;");
    entityMap.put("rfloor", "&#8971;");
    entityMap.put("lang", "&#9001;");


    entityMap.put("loz", "&#9674;");

    entityMap.put("spades", "&#9824;");
    entityMap.put("clubs", "&#9827;");
    entityMap.put("hearts", "&#9829;");
    entityMap.put("diams", "&#9830;");


    entityMap.put("quot", "&#34;");
    entityMap.put("amp", "&#38;");
    entityMap.put("lt", "&#60;");
    entityMap.put("gt", "&#62;");

    entityMap.put("OElig", "&#338;");
    entityMap.put("oelig", "&#339;");
    entityMap.put("Scaron", "&#352;");
    entityMap.put("scaron", "&#353;");
    entityMap.put("Yuml", "&#376;");

    entityMap.put("circ", "&#710;");
    entityMap.put("tilde", "&#732;");

    entityMap.put("ensp", "&#8194;");
    entityMap.put("emsp", "&#8195;");
    entityMap.put("thinsp", "&#8201;");
    entityMap.put("zwnj", "&#8204;");
    entityMap.put("zwj", "&#8205;");
    entityMap.put("lrm", "&#8206;");
    entityMap.put("rlm", "&#8207;");
    entityMap.put("ndash", "&#8211;");
    entityMap.put("mdash", "&#8212;");
    entityMap.put("lsquo", "&#8216;");
    entityMap.put("rsquo", "&#8217;");
    entityMap.put("sbquo", "&#8218;");
    entityMap.put("ldquo", "&#8220;");
    entityMap.put("rdquo", "&#8221;");
    entityMap.put("bdquo", "&#8222;");
    entityMap.put("dagger", "&#8224;");
    entityMap.put("Dagger", "&#8225;");
    entityMap.put("permil", "&#8240;");
    entityMap.put("lsaquo", "&#8249;");
    entityMap.put("rsaquo", "&#8250;");
}

然后我只是将这些数据作为DOCTYPE附加到文档中:

  // Build an XML declaration plus a DOCTYPE whose internal subset declares
  // one <!ENTITY> per entityMap entry, then put it in front of the document.
  StringBuilder prolog = new StringBuilder();
  prolog.append("<?xml version=\"1.0\"?> ").append(" <!DOCTYPE some_name [ ");
  for (Entry<String, String> entity : entityMap.entrySet()) {
      prolog.append("<!ENTITY ")
            .append(entity.getKey())
            .append(" \"")
            .append(entity.getValue())
            .append("\">");
  }
  prolog.append(" ]>");

  convertedData = prolog.toString() + convertedData;

答案 2 :(得分:3)

如果你的类路径上已经有commons-lang,请查看EntityArrays中的数组;它们包含所有实体的映射。

要获取数值,只需在第一个元素(Unicode字符)上使用codePointAt(0)

现在您需要一个基于正则表达式的循环来搜索&[^;]+;。这是非常安全的,因为&是一个需要转义的特殊字符。如果您需要100%确定,请查找CDATA元素并忽略它们。

答案 3 :(得分:3)

这就是我最终采用的方案。似乎工作正常:

/**
 * Helper methods for turning HTML fragments that use named entities into
 * text that only uses entities an XML parser understands.
 * 
 * @author David Maple<d@davemaple.com>
 *
 */
public class XhtmlUtil {

    /**
     * Matches exactly one entity reference: a named entity (&name;), a decimal
     * character reference (&#160;) or a hexadecimal one (&#xA0;).
     *
     * The pattern is deliberately strict. A looser "anything without
     * whitespace between '&' and ';'" pattern also matches across markup when
     * a bare '&' appears in it (for example href="x?a=1&b=2">t</a>&nbsp;),
     * and the escaping step would then rewrite the quotes and angle brackets
     * of the swallowed tags.
     */
    private static final Pattern ENTITY_PATTERN =
            Pattern.compile("&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);");

    /**
     * Don't instantiate me
     */
    private XhtmlUtil() { } 

    /**
     * Convert a String of HTML with named HTML entities to the 
     * same String with entities converted to numbered XML entities.
     * Everything that is not an entity reference, including all markup,
     * is copied through unchanged.
     * 
     * @param html the HTML text to convert; must not be null
     * @return the text with every entity reference passed through
     *         {@link #htmlEntityToXmlEntity(String)}
     */
    public static String htmlToXmlEntities(String html) {
        // Matcher.appendReplacement/appendTail require a StringBuffer here.
        StringBuffer converted = new StringBuffer(html.length());
        Matcher matcher = ENTITY_PATTERN.matcher(html);

        while (matcher.find()) {
            String xmlEntity = htmlEntityToXmlEntity(matcher.group());
            // Append the text before the match with an empty replacement and
            // add the entity separately, so '$' and '\' in the replacement are
            // never interpreted as group references.
            matcher.appendReplacement(converted, "");
            converted.append(xmlEntity);
        }

        matcher.appendTail(converted);
        return converted.toString();
    }

    /**
     * Replace an HTML entity with an XML entity by unescaping it as HTML and
     * escaping the result as XML.
     * 
     * @param html a single entity reference, e.g. "&nbsp;"
     * @return the result of the unescape/escape round trip
     */
    private static String htmlEntityToXmlEntity(String html) {
        return StringEscapeUtils.escapeXml(StringEscapeUtils.unescapeHtml(html));
    }

}

和相应的测试:

/**
 * Tests for XhtmlUtil.htmlToXmlEntities.
 */
public class XhtmlUtilTest {

    /** Entities in plain text are converted; a bare ampersand followed by a space is left alone. */
    @Test
    public void testEvalXmlEscape() {
        assertConverts(
                "link 1 &nbsp;|&nbsp; link2 &amp; & dkdk;",
                "link 1 &#160;|&#160; link2 &amp; & dkdk;");
    }

    /** An entity inside an element is converted and the element markup is preserved. */
    @Test
    public void testEvalXmlEscape2() {
        assertConverts(
                "<a href=\"test.html\">link&nbsp;</a>",
                "<a href=\"test.html\">link&#160;</a>");
    }

    /** Entities on several lines are all converted and the line break is kept. */
    @Test
    public void testEvalXmlEscapeMultiLine() {
        assertConverts(
                "<a href=\"test.html\">link&nbsp;</a>\n<a href=\"test.html\">link&nbsp;</a>",
                "<a href=\"test.html\">link&#160;</a>\n<a href=\"test.html\">link&#160;</a>");
    }

    /** Converts the given HTML, prints the result and compares it with the expected text. */
    private static void assertConverts(String html, String expectedXml) {
        String converted = XhtmlUtil.htmlToXmlEntities(html);
        System.out.println(converted);
        Assert.assertEquals(expectedXml, converted);
    }

}

答案 4 :(得分:1)

这是我使用的另一种解决方案

 /**
     * Converts the specified string to text that is safe to embed in XML.
     * Inspired by XMLWriter by http://www.megginson.com/Software/
     *
     * The markup characters &amp;, &lt;, &gt; and the double quote become
     * their predefined entities; every character above U+007F becomes a
     * decimal character reference; a tab becomes four spaces; line feed and
     * carriage return are kept; all other control characters below U+0020
     * are dropped.
     *
     * The input is processed by Unicode code point, so a character outside
     * the Basic Multilingual Plane yields one reference to the real code
     * point instead of two references to its surrogate halves (which XML
     * forbids). Unpaired surrogates cannot be represented in XML and are
     * dropped.
     *
     * @param string the text to convert; may be null
     * @return the converted text, or the empty string for null or empty input
     */
    public static String convertAsciiToXml(String string) {
        if (string == null || string.length() == 0) {
            return "";
        }

        StringBuilder sbuf = new StringBuilder(string.length());
        int index = 0;
        while (index < string.length()) {
            int codePoint = string.codePointAt(index);
            index += Character.charCount(codePoint);

            switch (codePoint) {
                case '&':
                    sbuf.append("&amp;");
                    break;
                case '<':
                    sbuf.append("&lt;");
                    break;
                case '>':
                    sbuf.append("&gt;");
                    break;
                case '\"':
                    sbuf.append("&quot;");
                    break;
                default:
                    if (codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE) {
                        // codePointAt only returns a surrogate value when it is
                        // unpaired; there is no legal XML form for it, so skip it.
                        break;
                    }
                    if (codePoint > 0x7f) {
                        sbuf.append("&#").append(codePoint).append(';');
                    }
                    else if (codePoint == '\t') {
                        sbuf.append("    ");
                    }
                    else if (codePoint >= 32 || codePoint == '\n' || codePoint == '\r') {
                        sbuf.append((char) codePoint);
                    }
                    // Remaining control characters are intentionally dropped.
            }
        }
        return sbuf.toString();
    }