将Unicode实体转换为Unicode文本

时间:2015-10-24 08:33:57

标签: shell batch-file cmd cygwin

我有一个包含Unicode实体(转义序列)的文本文件。有没有办法通过cmd/批处理或cygwin,把这些实体全部转换成它们所对应的实际字符?关于这个问题,我似乎找不到更多的资料。

journal\u0027s将成为journal's

1 个答案:

答案 0 :(得分:1)

使用.bat扩展名保存,例如decodeStrings.bat:

0</* :
@echo off
rem Batch/JScript hybrid header. cmd.exe executes the lines from here down
rem to the exit statement; the JScript engine sees that same span as one
rem block comment sitting inside a harmless numeric comparison.

    rem Run this very file again under cscript as JScript, forwarding all arguments.
    cscript /nologo /E:jscript "%~f0" %*

rem Leave the batch part here, passing the current errorlevel to the caller.
exit /b %errorlevel% */0;


    // Lookup table: the letter that follows a backslash -> the control
    // character that escape stands for. Letters not listed have no entry.
    var jsEscapes = {
      b: "\b",
      f: "\f",
      n: "\n",
      r: "\r",
      t: "\t",
      v: "\v"
    };


    //string evaluation
    //http://stackoverflow.com/questions/24294265/how-to-re-enable-special-character-sequneces-in-javascript

    // Replacement callback used by decodeJsString: receives the capture
    // groups of one matched escape sequence and returns the text to put in
    // its place.
    //   _     - the whole matched escape (unused)
    //   hex0  - hex digits captured after "\u", if that form matched
    //   hex1  - hex digits captured after "\x", if that form matched
    //   octal - octal digits captured after the backslash, if that form matched
    //   other - any other single character captured after the backslash
    // Numeric forms yield the character with that code. Otherwise the
    // jsEscapes table is consulted; when it has no entry the character
    // itself is returned.
    function decodeJsEscape(_, hex0, hex1, octal, other) {
      var digits = hex0 || hex1;
      var radix = 16;
      if (!digits) {
        // No hex form matched: fall back to the octal capture, if any.
        digits = octal;
        radix = 8;
      }
      if (digits) {
        return String.fromCharCode(parseInt(digits, radix));
      }
      var mapped = jsEscapes[other];
      return mapped ? mapped : other;
    }

    // Returns a copy of s in which every backslash escape sequence has been
    // replaced by whatever decodeJsEscape returns for it.
    function decodeJsString(s) {
      // Alternatives tried in order after the backslash:
      //   group 1 - "u" plus exactly four hex digits
      //   group 2 - "x" plus exactly two hex digits
      //   group 3 - an octal escape (one to three digits, two at most when
      //             the first digit is 4-7)
      //   group 4 - any other single character matched by "."
      var escapePattern = /\\(?:u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([0-3][0-7]{0,2}|[4-7][0-7]?)|(.))/g;
      return s.replace(escapePattern, decodeJsEscape);
    }

    var ARGS = WScript.Arguments;

    // Decode each command-line argument in turn and echo the result.
    var i = 0;
    while (i < ARGS.Length) {
        WScript.Echo(decodeJsString(ARGS.Item(i)));
        i++;
    }

并使用它:

call decodeStrings.bat  journal\u0027s journal\u0027s

输出:

journal's
journal's

下面这个脚本用来处理文件,把文件中的转义序列就地转换(可以一次传入任意多个文件路径):

0</* :
@echo off
rem Batch/JScript hybrid header. cmd.exe executes the lines from here down
rem to the exit statement; the JScript engine sees that same span as one
rem block comment sitting inside a harmless numeric comparison.

    rem Run this very file again under cscript as JScript, forwarding all arguments.
    cscript /nologo /E:jscript "%~f0" %*

rem Leave the batch part here, passing the current errorlevel to the caller.
exit /b %errorlevel% */0;

    var ARGS = WScript.Arguments;

    // At least one argument is required; otherwise report and quit with code 1.
    if (ARGS.Length < 1 ) {
        WScript.Echo("Wrong arguments");
        WScript.Quit(1);
    }

    // A first argument of -help or -h (any letter case) prints usage and quits with code 0.
    switch (ARGS.Item(0).toLowerCase()) {
        case "-help":
        case "-h":
            WScript.Echo("Evaluates unicode/special sequences in file");
            WScript.Echo(WScript.ScriptName + " path_to_file [path_to_file]");
            WScript.Quit(0);
            break;
    }


    // Lookup table: the letter that follows a backslash -> the control
    // character that escape stands for. Letters not listed have no entry.
    var jsEscapes = {
      b: "\b",
      f: "\f",
      n: "\n",
      r: "\r",
      t: "\t",
      v: "\v"
    };


    //string evaluation
    //http://stackoverflow.com/questions/24294265/how-to-re-enable-special-character-sequneces-in-javascript

    // Replacement callback used by decodeJsString: receives the capture
    // groups of one matched escape sequence and returns the text to put in
    // its place.
    //   _     - the whole matched escape (unused)
    //   hex0  - hex digits captured after "\u", if that form matched
    //   hex1  - hex digits captured after "\x", if that form matched
    //   octal - octal digits captured after the backslash, if that form matched
    //   other - any other single character captured after the backslash
    // Numeric forms yield the character with that code. Otherwise the
    // jsEscapes table is consulted; when it has no entry the character
    // itself is returned.
    function decodeJsEscape(_, hex0, hex1, octal, other) {
      var digits = hex0 || hex1;
      var radix = 16;
      if (!digits) {
        // No hex form matched: fall back to the octal capture, if any.
        digits = octal;
        radix = 8;
      }
      if (digits) {
        return String.fromCharCode(parseInt(digits, radix));
      }
      var mapped = jsEscapes[other];
      return mapped ? mapped : other;
    }

    // Returns a copy of s in which every backslash escape sequence has been
    // replaced by whatever decodeJsEscape returns for it.
    function decodeJsString(s) {
      // Alternatives tried in order after the backslash:
      //   group 1 - "u" plus exactly four hex digits
      //   group 2 - "x" plus exactly two hex digits
      //   group 3 - an octal escape (one to three digits, two at most when
      //             the first digit is 4-7)
      //   group 4 - any other single character matched by "."
      var escapePattern = /\\(?:u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([0-3][0-7]{0,2}|[4-7][0-7]?)|(.))/g;
      return s.replace(escapePattern, decodeJsEscape);
    }




  // Reads the whole of `file` through an ADODB.Stream opened as "iso-8859-1"
  // text and returns it as a string.
  //   file - path of the file to read
  // Returns the text, with every character found in the adjustment table
  // below replaced by the character whose code is 128 + its table index.
  // Any error raised while loading or reading propagates to the caller; the
  // stream is closed first in every case.
  function getContent(file) {
        // :: http://www.dostips.com/forum/viewtopic.php?f=3&t=3855&start=15&p=28898  ::
        var ado = WScript.CreateObject("ADODB.Stream");
        ado.Type = 2;  // adTypeText = 2

        ado.CharSet = "iso-8859-1";  // code page with minimum adjustments for input
        ado.Open();

        // 32 characters; the one at index k is mapped back to code 128 + k.
        // NOTE(review): these look like the characters the stream hands back
        // for bytes 0x80-0x9F under this charset -- confirm against ADODB.
        var adjustment = "\u20AC\u0081\u201A\u0192\u201E\u2026\u2020\u2021" +
                         "\u02C6\u2030\u0160\u2039\u0152\u008D\u017D\u008F" +
                         "\u0090\u2018\u2019\u201C\u201D\u2022\u2013\u2014" +
                         "\u02DC\u2122\u0161\u203A\u0153\u009D\u017E\u0178" ;

        var lnkBytes;
        try {
            ado.LoadFromFile(file);

            var fs = new ActiveXObject("Scripting.FileSystemObject");
            var size = (fs.getFile(file)).size;

            lnkBytes = ado.ReadText(size);
        } finally {
            // Release the stream even when loading or reading fails.
            ado.Close();
        }

        var chars = lnkBytes.split('');
        // Bound the loop by the characters actually read, not by the file size.
        for (var indx = 0; indx < chars.length; indx++) {
            if ( chars[indx].charCodeAt(0) > 255 ) {
                var offset = adjustment.indexOf(chars[indx]);
                // Characters missing from the table are left untouched rather
                // than being collapsed to code 127 by an unchecked -1.
                if (offset >= 0) {
                    chars[indx] = String.fromCharCode(128 + offset);
                }
            }
        }
        return chars.join("");
   }

   // Writes `content` to `file` through an ADODB.Stream opened as
   // "iso-8859-1" text, overwriting the file if it already exists.
   //   file    - path of the file to write
   //   content - text to write
   // Any error raised while writing or saving propagates to the caller; the
   // stream is closed first in every case.
   function writeContent(file,content) {
        var ado = WScript.CreateObject("ADODB.Stream");
        ado.Type = 2;  // adTypeText = 2
        ado.CharSet = "iso-8859-1";  // right code page for output (no adjustments)
        ado.Open();

        try {
            ado.WriteText(content);
            ado.SaveToFile(file, 2);  // adSaveCreateOverWrite = 2
        } finally {
            // Release the stream even when writing or saving fails.
            ado.Close();
        }
   }

    // Rewrite each file named on the command line in place: read it, decode
    // its escape sequences, then save the result back to the same path.
    var i = 0;
    while (i < ARGS.Length) {
        WScript.Echo("Processing: " + ARGS.Item(i));
        var content = getContent(ARGS.Item(i));
        writeContent(ARGS.Item(i), decodeJsString(content));
        i++;
    }