将FileReader.readAsBinaryString转换为Unicode转义字符串

时间:2015-03-12 10:33:29

标签: javascript unicode filereader

FileReader.readAsBinaryString()返回UTF-8编码的二进制字符串。如何将数据作为一系列Unicode转义序列(\uxxxx)来获取?

4 个答案:

答案 0 :(得分:4)

不推荐使用FileReader.readAsBinaryString() - 而是使用readAsArrayBuffer()。这允许您使用以下两种方法之一将输入字符串转换为转义的unicode字符:

方法1

它使用ArrayBuffer和Uint8Array视图。下面的演示假设缓冲区已经加载完毕(这里用一些虚拟数据代替)。

// Demo input standing in for a loaded file: the bytes of two 16-bit
// values (0x20AC, 0x2B08), high byte first (big-endian).
var buffer = new Uint8Array([0x20, 0xac, 0x2b, 0x08]);
var pos = 0;
var txt = "";

// Consume two bytes per step and append one "\u" escape built from their hex digits.
for (; pos < buffer.length; pos += 2) {
    txt += "\\u" + toString(buffer[pos]) + toString(buffer[pos + 1]);
}

// Format a byte as hexadecimal, prefixing single-digit results (values
// below 0x10) with "0" so every byte contributes exactly two digits.
function toString(v) {
  var hex = v.toString(16);
  if (hex.length === 1) {
    return "0" + hex;
  }
  return hex;
}

// Show the escaped text on the page. `out` is presumably the <output id="out">
// element below, reached through the browser's id-to-global lookup.
out.innerHTML = txt;
<output id="out"></output>

如果数据是little-endian(大多数主流系统都是如此),这里可以改用Uint16Array,只需把单个16位值转换为字符串,而不是两个字节。或者使用DataView,这样就可以按指定的字节序读取。这种方式可能会稍快一些,也可能不会(浏览器会进行字节交换,我们一次操作读取16位,而位数检查只是合并到了toString方法中):

// Demo input: two 16-bit values (0x20AC, 0x2B08) stored big-endian.
var data = new Uint8Array([0x20, 0xac, 0x2b, 0x08]);
// Wrap the underlying ArrayBuffer so each read can state its byte order.
var view = new DataView(data.buffer);
var pos = 0;
var txt = "";

// Read one 16-bit value (two bytes) per step and append it as a "\u" escape.
for (; pos < view.byteLength; pos += 2) {
    // false = big-endian read; pass true instead when the data is little-endian
    txt += "\\u" + toString(view.getUint16(pos, false));
}

// Format a 16-bit value as hexadecimal, left-padded with zeros to four digits.
// Every short result is padded (one, two or three digits), so 0x8 becomes
// "0008"; a result that already has four or more digits is returned unchanged.
function toString(v) {
  var s = v.toString(16);
  return s.length < 4 ? "0000".slice(s.length) + s : s;
}

// Show the escaped text on the page. `out` is presumably the <output id="out">
// element below, reached through the browser's id-to-global lookup.
out.innerHTML = txt;
<output id="out"></output>

方法2

这使用新的TextDecoder API来解析输入缓冲区 - 这里也假设是一个ArrayBuffer。

然后将escape()与replace()配合使用。这是一种快速的转换方式,但escape()同样已不推荐使用。不过它短期内不会被移除,所以如果你愿意冒点险,它也不失为一个选项 - 无论如何我还是把它列在这里:

// Demo input standing in for loaded file data: two 16-bit values, big-endian.
var buffer = new Uint8Array([0x20, 0xac, 0x2b, 0x08]);
// "utf-16be" selects the big-endian decoder.
var td = new TextDecoder("utf-16be");

// Decode the raw bytes into an ordinary JavaScript string.
var txt = td.decode(buffer);

// escape() is deprecated but still widely available. For this input it yields
// "%uXXXX" sequences, so swapping each "%" for a backslash gives "\uXXXX".
out.innerHTML = escape(txt).split("%").join("\\");

// or use the same last step as in method 1, just showing an alternative way

//=> "\u20AC\u2B08"
<output id="out"></output>

注意:您可能已经注意到我已经为字节顺序指示了big-endian。通常,在从网络读取文件或二进制数据时使用big-endian(也称为网络顺序)。如果数据恰好是小端格式,则需要交换字节顺序:

对于方法1 ,您可以这样做:

// Little-endian input stores the low byte of each pair first, so print the
// second byte of the pair ahead of the first.
for (; pos < buffer.length; pos += 2) {
    txt += "\\u" + toString(buffer[pos + 1]) + toString(buffer[pos]);
}

或者仅使用Uint16Array和上面提到的修改过的toString方法。

对于方法2 ,您只需为utf-16指定little-endian版本:

var td = new TextDecoder("utf-16");  // the plain "utf-16" label decodes as little-endian

请注意,TextDecoder尚未稳定,也并非所有浏览器都支持。

答案 1 :(得分:3)

我想我可能已经为您找到了解决方案。

在这个网站上,我发现了一个开源项目,它将文本转换为Unicode表示法。

我编辑了与项目相关的函数,并创建了一个小函数来处理打开的文件。

 /*
Copyright (C) 2007  Richard Ishida ishida@w3.org
This program is free software; you can redistribute it and/or modify it under the terms 
of the GNU General Public License as published by the Free Software Foundation; either 
version 2 of the License, or (at your option) any later version as long as you point to 
http://rishida.net/ in your code.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the GNU General Public License for more details. http://www.gnu.org/licenses/gpl.html
*/

 /**
  * Formats a number as an upper-case hexadecimal string without padding.
  *
  * @param {number} textString - value to format (the parameter name is historical).
  * @returns {string} upper-case hex digits, e.g. 255 -> "FF".
  */
 function dec2hex(textString) {
     return (textString + 0).toString(16).toUpperCase();
 }

 /**
  * Converts a string into space-separated tokens, one per character.
  *
  * Characters in the Basic Multilingual Plane become "\u" followed by their
  * upper-case hex code; characters selected by `preserve` are copied as-is.
  * A valid surrogate pair is combined into a single code point. A high
  * surrogate that is not followed by a low surrogate (including one at the
  * very end of the input) produces an inline "Error in convertChar2CP" message.
  *
  * @param {string} textString - the string to convert.
  * @param {string} [preserve] - 'ascii' keeps characters up to 127 unconverted,
  *     'latin1' keeps characters up to 255 unconverted; anything else converts all.
  * @param {boolean} [pad] - when truthy, hex codes shorter than four digits are
  *     left-padded with zeros.
  * @returns {string} the tokens joined by single spaces ("" for empty input).
  */
 function convertCharStr2Unicode(textString, preserve, pad) {
     // High surrogate waiting for its low half; 0 means "none pending".
     var pendingHigh = 0;
     var CPstring = '';
     for (var i = 0; i < textString.length; i++) {
         // charCodeAt() yields a UTF-16 code unit (0..0xFFFF) for every in-range
         // index, so no separate range check is needed here.
         var b = textString.charCodeAt(i);
         if (pendingHigh !== 0) {
             if (0xDC00 <= b && b <= 0xDFFF) {
                 // Combine the surrogate pair into one supplementary code point.
                 // NOTE(review): this token is emitted as bare hex, without the
                 // "\u" prefix or padding used for BMP characters - confirm
                 // whether that difference is intended.
                 CPstring += dec2hex(0x10000 + ((pendingHigh - 0xD800) << 10) + (b - 0xDC00)) + ' ';
                 pendingHigh = 0;
                 continue;
             }
             // Unpaired high surrogate: report it, then handle `b` normally below.
             CPstring += 'Error in convertChar2CP: surrogate out of range ' + dec2hex(pendingHigh) + '!';
             pendingHigh = 0;
         }
         if (0xD800 <= b && b <= 0xDBFF) {
             pendingHigh = b;
             continue;
         }
         var keepLiteral = (b <= 127 && preserve === 'ascii') ||
             (b <= 255 && preserve === 'latin1');
         if (keepLiteral) {
             CPstring += textString.charAt(i) + ' ';
             continue;
         }
         // Declared locally: an undeclared assignment would create a global and
         // throws a ReferenceError in strict-mode or module code.
         var cp = dec2hex(b);
         if (pad) {
             while (cp.length < 4) {
                 cp = '0' + cp;
             }
         }
         CPstring += '\\u' + cp + ' ';
     }
     if (pendingHigh !== 0) {
         // Input ended on a high surrogate: report it instead of dropping it.
         // The trailing space is what the final trim below removes.
         CPstring += 'Error in convertChar2CP: surrogate out of range ' + dec2hex(pendingHigh) + '! ';
     }
     // Every token ends with a separator space; strip the last one.
     return CPstring.substring(0, CPstring.length - 1);
 }

演示(使用文件处理和文本输入):http://jsfiddle.net/howderek/btp6zd50/

答案 2 :(得分:2)

此函数会转义非ASCII字符,并将Unicode字符转换回\uHHHH形式。

// Return `str` with every character outside printable ASCII (codes 32-126)
// replaced by a lower-case, zero-padded "\uhhhh" escape of its UTF-16 code unit.
function ascii(str) {
  var parts = [];

  for (var pos = 0; pos < str.length; pos += 1) {
    var code = str.charCodeAt(pos);
    if (code >= 32 && code <= 126) {
      // printable ASCII passes through untouched
      parts.push(str.charAt(pos));
      continue;
    }
    // anything else becomes a four-digit hex escape
    var hex = code.toString(16);
    parts.push("\\u" + "0000".slice(hex.length) + hex);
  }

  return parts.join("");
}

您可能希望保留文本格式,或者将其编码为\n、\r、\t。如果之后需要编辑文本,这可能会有所帮助。

// Sample text: five symbols outside ASCII, each followed by a line feed.
var x = "♫\n☆\n⚛\n☯\n⚓\n";
console.log(x);

// Escape everything outside printable ASCII (codes 32-126) as "\uhhhh".
//   formatting === true : tab, line feed and carriage return are not hex-escaped
//   convert === true    : (only with formatting) those three are written as the
//                         two-character sequences \t, \n and \r instead of raw
function ascii(str, formatting, convert) {
  // Two-character spellings for the three layout characters, keyed by char code.
  var layoutEscapes = { 9: "\\t", 10: "\\n", 13: "\\r" };
  var result = "";

  for (var pos = 0; pos < str.length; pos += 1) {
    var code = str.charCodeAt(pos);
    var ch = str.charAt(pos);

    if (code >= 32 && code <= 126) {
      // printable ASCII passes through untouched
      result += ch;
      continue;
    }

    var spelled = layoutEscapes[code];
    if (formatting === true && spelled !== undefined) {
      result += convert === true ? spelled : ch;
      continue;
    }

    // anything else becomes a four-digit hex escape
    var hex = code.toString(16);
    result += "\\u" + "0000".slice(hex.length) + hex;
  }

  return result;
}

console.log(ascii(x));
console.log(ascii(x, true));
console.log(ascii(x, true, true));

输出:

♫
☆
⚛
☯
⚓

\u266b\u000a\u2606\u000a\u269b\u000a\u262f\u000a\u2693\u000a

\u266b
\u2606
\u269b
\u262f
\u2693

\u266b\n\u2606\n\u269b\n\u262f\n\u2693\n

答案 3 :(得分:0)

FileReader.readAsBinaryString()不在W3C File API工作草案(working draft)中。

使用Blob

// Ask XMLHttpRequest to hand back the response body as a Blob.
xhr.responseType = 'blob';

在下面的代码片段中,我获取了一个图像,但您可以获得任何二进制文件类型。

// Download a binary file as a Blob, then show it both in the iframe and as an
// <img> appended to the display area.
var fileDisplayArea = document.getElementById('fileDisplayArea');
var testFrame = document.getElementById('testFrame');

window.URL = window.URL || window.webkitURL;  // Take care of vendor prefixes.

var xhr = new XMLHttpRequest();
xhr.open('GET', 'http://i.imgur.com/Ar1SBEH.png', true);
xhr.responseType = 'blob';

xhr.onload = function() {
  if (this.status !== 200) {
    return;
  }

  var blob = this.response; // a Blob, because responseType is 'blob'
  console.log(blob);

  // Each object URL keeps the blob alive until it is revoked, so release the
  // iframe's URL once the frame has loaded, just as the image's URL is
  // released further down.
  var frameUrl = window.URL.createObjectURL(blob);
  testFrame.addEventListener('load', function onFrameLoad() {
    testFrame.removeEventListener('load', onFrameLoad);
    window.URL.revokeObjectURL(frameUrl);
  });
  testFrame.src = frameUrl;

  var img = document.createElement('img');
  img.onload = function() {
    window.URL.revokeObjectURL(img.src); // Clean up after yourself.
  };
  img.src = window.URL.createObjectURL(blob);
  fileDisplayArea.appendChild(img);
};

xhr.send();
<iframe id="testFrame" src=""></iframe>
<div id="fileDisplayArea"></div>