在awk中转义HTML特殊字符

时间:2014-03-11 08:45:19

标签: html linux awk

我想用awk脚本生成一个HTML文件。我的字符串中可能包含“<”和“&”之类的字符。awk中是否有简短且经过验证的函数可以对这些字符进行转义?

2 个答案:

答案 0 :(得分:2)

当然可以。只需对要转换的每一行($0)调用 makeEntities() 即可,或者修改它以接受参数。我编写它是为了处理英国国家语料库(British National Corpus),该语料库使用的实体与HTML实体高度重叠,但并非100%一致,所以如果你需要一些不常见的字符,应当核实它们是否正确。

# makeEntities(): rewrite the current record ($0) in place, replacing special
# and non-ASCII characters with named SGML/HTML entity references.
#
# Takes no arguments and returns nothing; every gsub() below operates on $0.
#
# Replacement strings are written as "\\&name;": the string literal "\\&"
# reaches gsub() as \& which stands for a literal ampersand (a bare & in a
# gsub() replacement would mean "the matched text").
#
# NOTE(review): the table follows British National Corpus entity names. Some
# of them (the Greek "*gr" names, &ft;, &ins;, &rehy;) do not appear to be
# HTML named character references -- verify before relying on them in HTML.
function makeEntities()  {
    # The ampersand must be handled first: every later rule inserts "&", and
    # escaping "&" afterwards would turn e.g. "&aacute;" into "&amp;aacute;".
    gsub(/&/,  "\\&amp;");

    gsub(/á/,  "\\&aacute;");
    gsub(/Á/,  "\\&Aacute;");
    gsub(/ă/,  "\\&abreve;");
    gsub(/â/,  "\\&acirc;");
    gsub(/´/,  "\\&acute;");
    gsub(/æ/,  "\\&aelig;");
    gsub(/Æ/,  "\\&AElig;");
    gsub(/α/,  "\\&agr;");
    gsub(/à/,  "\\&agrave;");
    gsub(/ā/,  "\\&amacr;");
    gsub(/Ā/,  "\\&Amacr;");
    gsub(/ą/,  "\\&aogon;");
    gsub(/å/,  "\\&aring;");
    gsub(/Å/,  "\\&Aring;");
    gsub(/ã/,  "\\&atilde;");
    gsub(/ä/,  "\\&auml;");
    gsub(/Ä/,  "\\&Auml;");
    gsub(/β/,  "\\&bgr;");
    gsub(/\\/, "\\&bsol;");
    gsub(/•/,  "\\&bull;");
    gsub(/ć/,  "\\&cacute;");
    gsub(/č/,  "\\&ccaron;");
    gsub(/Č/,  "\\&Ccaron;");
    gsub(/ç/,  "\\&ccedil;");
    gsub(/Ç/,  "\\&Ccedil;");
    gsub(/ĉ/,  "\\&ccirc;");
    gsub(/✓/,  "\\&check;");
    gsub(/ˆ/,  "\\&circ;");
    gsub(/@/,  "\\&commat;");
    gsub(/©/,  "\\&copy;");
    gsub(/‐/,  "\\&dash;");
    gsub(/ď/,  "\\&dcaron;");
    gsub(/°/,  "\\&deg;");
    gsub(/δ/,  "\\&dgr;");
    gsub(/Δ/,  "\\&Dgr;");
    # The diaeresis has two names (&die; and &uml;); only one rule can ever
    # match, so a single mapping is kept.
    gsub(/¨/,  "\\&die;");
    gsub(/\$/, "\\&dollar;");
    gsub(/đ/,  "\\&dstrok;");
    gsub(/é/,  "\\&eacute;");
    gsub(/É/,  "\\&Eacute;");
    gsub(/ě/,  "\\&ecaron;");
    gsub(/ê/,  "\\&ecirc;");
    gsub(/è/,  "\\&egrave;");
    gsub(/È/,  "\\&Egrave;");
    gsub(/ε/,  "\\&egr;");
    gsub(/ē/,  "\\&emacr;");
    gsub(/Ē/,  "\\&Emacr;");
    gsub(/ę/,  "\\&eogon;");
    gsub(/ð/,  "\\&eth;");
    gsub(/ë/,  "\\&euml;");
    gsub(/Ë/,  "\\&Euml;");
    gsub(/♭/,  "\\&flat;");
    # One half has two names (&frac12; and &half;); a single mapping is kept.
    gsub(/½/,  "\\&frac12;");
    gsub(/⅓/,  "\\&frac13;");
    gsub(/¼/,  "\\&frac14;");
    gsub(/⅕/,  "\\&frac15;");
    gsub(/⅙/,  "\\&frac16;");
    gsub(/⅛/,  "\\&frac18;");
    gsub(/⅔/,  "\\&frac23;");
    gsub(/⅖/,  "\\&frac25;");
    gsub(/¾/,  "\\&frac34;");
    gsub(/⅗/,  "\\&frac35;");
    gsub(/⅜/,  "\\&frac38;");
    gsub(/⅘/,  "\\&frac45;");
    gsub(/⅝/,  "\\&frac58;");
    gsub(/⅞/,  "\\&frac78;");
    gsub(/′/,  "\\&ft;");
    gsub(/γ/,  "\\&ggr;");
    gsub(/>/,  "\\&gt;");
    gsub(/ħ/,  "\\&hstrok;");
    gsub(/í/,  "\\&iacute;");
    gsub(/Í/,  "\\&Iacute;");
    gsub(/î/,  "\\&icirc;");
    gsub(/Î/,  "\\&Icirc;");
    gsub(/ì/,  "\\&igrave;");
    gsub(/ī/,  "\\&imacr;");
    gsub(/″/,  "\\&ins;");
    gsub(/¿/,  "\\&iquest;");
    gsub(/ï/,  "\\&iuml;");
    gsub(/Ï/,  "\\&Iuml;");
    gsub(/ĺ/,  "\\&lacute;");
    gsub(/Ĺ/,  "\\&Lacute;");
    gsub(/\{/, "\\&lcub;");
    gsub(/≤/,  "\\&le;");
    gsub(/λ/,  "\\&lgr;");
    gsub(/_/,  "\\&lowbar;");
    gsub(/\[/, "\\&lsqb;");
    gsub(/ł/,  "\\&lstrok;");
    gsub(/Ł/,  "\\&Lstrok;");
    gsub(/</,  "\\&lt;");
    gsub(/—/,  "\\&mdash;");
    gsub(/μ/,  "\\&mgr;");
    gsub(/µ/,  "\\&micro;");
    gsub(/·/,  "\\&middot;");
    gsub(/ń/,  "\\&nacute;");
    gsub(/ň/,  "\\&ncaron;");
    gsub(/ņ/,  "\\&ncedil;");
    gsub(/–/,  "\\&ndash;");
    gsub(/ñ/,  "\\&ntilde;");
    gsub(/Ñ/,  "\\&Ntilde;");
    gsub(/#/,  "\\&num;");
    gsub(/ó/,  "\\&oacute;");
    gsub(/Ó/,  "\\&Oacute;");
    gsub(/ô/,  "\\&ocirc;");
    gsub(/œ/,  "\\&oelig;");
    gsub(/ò/,  "\\&ograve;");
    gsub(/Ω/,  "\\&ohm;");
    gsub(/ō/,  "\\&omacr;");
    gsub(/ø/,  "\\&oslash;");
    gsub(/Ø/,  "\\&Oslash;");
    gsub(/õ/,  "\\&otilde;");
    gsub(/ö/,  "\\&ouml;");
    gsub(/Ö/,  "\\&Ouml;");
    gsub(/φ/,  "\\&phgr;");
    gsub(/\+/, "\\&plus;");
    gsub(/±/,  "\\&plusmn;");
    gsub(/£/,  "\\&pound;");
    gsub(/ŕ/,  "\\&racute;");
    gsub(/√/,  "\\&radic;");
    gsub(/ř/,  "\\&rcaron;");
    gsub(/Ř/,  "\\&Rcaron;");
    gsub(/\}/, "\\&rcub;");
    gsub(/®/,  "\\&reg;");
    gsub(/-/,  "\\&rehy;");
    gsub(/\]/, "\\&rsqb;");
    gsub(/ś/,  "\\&sacute;");
    gsub(/Ś/,  "\\&Sacute;");
    gsub(/š/,  "\\&scaron;");
    gsub(/Š/,  "\\&Scaron;");
    gsub(/ş/,  "\\&scedil;");
    gsub(/Ş/,  "\\&Scedil;");
    gsub(/ŝ/,  "\\&scirc;");
    gsub(/σ/,  "\\&sgr;");
    gsub(/♯/,  "\\&sharp;");
    gsub(/∼/,  "\\&sim;");
    # The slash has two names (&shilling; and &sol;); only one rule can ever
    # match, and &sol; is the one that is an HTML named character reference.
    gsub(/\//, "\\&sol;");
    gsub(/²/,  "\\&sup2;");
    gsub(/ß/,  "\\&szlig;");
    gsub(/ť/,  "\\&tcaron;");
    gsub(/ţ/,  "\\&tcedil;");
    gsub(/τ/,  "\\&tgr;");
    gsub(/þ/,  "\\&thorn;");
    gsub(/Þ/,  "\\&THORN;");
    gsub(/×/,  "\\&times;");
    gsub(/™/,  "\\&trade;");
    gsub(/ú/,  "\\&uacute;");
    gsub(/Ú/,  "\\&Uacute;");
    gsub(/û/,  "\\&ucirc;");
    gsub(/ù/,  "\\&ugrave;");
    gsub(/ū/,  "\\&umacr;");
    gsub(/ů/,  "\\&uring;");
    gsub(/ü/,  "\\&uuml;");
    gsub(/Ü/,  "\\&Uuml;");
    gsub(/\|/, "\\&verbar;");
    gsub(/ŵ/,  "\\&wcirc;");
    gsub(/ý/,  "\\&yacute;");
    gsub(/ŷ/,  "\\&ycirc;");
    gsub(/¥/,  "\\&yen;");
    gsub(/ÿ/,  "\\&yuml;");
    gsub(/Ÿ/,  "\\&Yuml;");
    gsub(/ź/,  "\\&zacute;");
    gsub(/Ž/,  "\\&Zcaron;");
    gsub(/ž/,  "\\&zcaron;");
    gsub(/ż/,  "\\&zdot;");
}

答案 1 :(得分:0)

如果只需要最低限度的转义,您可以执行以下操作:

# escapeHtml(t): return t with the three HTML markup characters &, < and >
# replaced by &amp;, &lt; and &gt;. awk passes scalars by value, so the
# caller's variable is left untouched.
function escapeHtml(t) {
  # Ampersands go first; otherwise the "&" inserted by the two rules below
  # would itself be escaped again. "\\&" yields a literal "&" in the
  # replacement instead of "the matched text".
  gsub(/&/,  "\\&amp;", t)

  gsub(/</,  "\\&lt;", t)
  gsub(/>/,  "\\&gt;", t)

  return t
}