将包含 UTF-8 字符串的变量转换为包含 latin1 字符串的变量——在浏览器中使用 JavaScript 实现

时间:2017-10-26 19:30:33

标签: javascript browser utf-8 character-encoding iso-8859-1

等效的 Bash 做法(shell 编码设置为 UTF-8):

输入:

in.json

$ file -I in.json
in.json: text/plain; charset=utf-8

{"it-it":"Città"}

我需要用 JS 实现与下面这条 Bash 命令等效的功能:

$ iconv -f utf8 -t latin1 in.json > out.json

out.json

$ file -I out.json
out.json: text/plain; charset=iso-8859-1

{"it-it":"Città"}(文件以 latin1 编码,à 存储为单字节 0xE0)

当通过 input type="file" 以 base64 方式读取文件时,浏览器中的 JavaScript 看到的 in.json 内容如下(尽管 content-type 和脚本编码都已设置为 utf8):

{"it-it":"CittÃ "}

Javascript在浏览器中看到out.json:

{"it-it":"Città"}

问题——如何用尽可能原生的 JavaScript 方式,在大多数现代浏览器中把这个 utf8 字符串

({"it-it":"CittÃ "} 是按 latin1 解读的结果,{"it-it":"Città"} 是按 utf8 解读的结果)

转换为 latin1 字符串?

我更希望使用原生解决方案,最坏的情况下可以接受 jQuery;请尽量不要用 npm + Node 的“依赖地狱”来解决这个问题。

P.S.:我只需要支持最新的现代浏览器,这个功能只用于仅限管理员访问的页面。

2 个答案:

答案 0 :(得分:0)

下面我创建了一个数组,其内容是 CittÃ 的 iso-8859-1 编码字节,然后使用 TextDecoder 对其进行解码。

因此,如果您可以获得JSON的二进制版本,则应该可以为您进行转换。

// Byte values of "CittÃ" in ISO-8859-1: C, i, t, t, Ã (195 = 0xC3).
var latinSource = Uint8Array.of(67, 105, 116, 116, 195);

// Decoder requested with the "iso-8859-1" label.
var tc = new TextDecoder("iso-8859-1");

{
    // Decode the bytes once, then print the resulting string.
    const decodedText = tc.decode(latinSource);
    console.log(decodedText);
}

答案 1 :(得分:0)

对我来说,'new TextDecoder("iso-8859-1")' 不起作用...

1.

// Every possible byte value, 0 through 255, in ascending order.
var latinSource = Uint8Array.from({ length: 256 }, function (_, byteValue) {
    return byteValue;
});

var tc = new TextDecoder("iso-8859-1");

// The author reports that the decoded text is a windows-1252 string.
console.log(tc.decode(latinSource));

可以看到,结果并不是 latin1 字符串,因为其中包含字符 '€'。

2.

// Inspect the decoder object that is created for the "iso-8859-1" label.
{
    const decoderForLabel = new TextDecoder("iso-8859-1");
    console.log('new TextDecoder("iso-8859-1")', decoderForLabel);
}
// Output reported by the author (note that "encoding" is windows-1252):
//new TextDecoder("iso-8859-1") {
//  "encoding": "windows-1252",
//  "fatal": false,
//  "ignoreBOM": false,
//  "decode": function decode() { [native code] }
//}

  3. 对 latin-1 进行编码和解码的可行方式:

/**
 * Turns a Latin-1 (ISO-8859-1) string into bytes, one byte per UTF-16 code unit.
 *
 * Each element is assigned the code unit returned by charCodeAt; because the
 * target is a Uint8Array, only the low 8 bits of each code unit are kept.
 *
 * @param {string} iso_8859_1 - String whose characters are expected to be in U+0000-U+00FF.
 * @returns {Uint8Array} One byte for every code unit of the input.
 */
function Latin1ToUint8Array(iso_8859_1){
    return Uint8Array.from({ length: iso_8859_1.length }, function (_, index) {
        return iso_8859_1.charCodeAt(index);
    });
}

/**
 * Builds a Latin-1 (ISO-8859-1) string from bytes: every byte value becomes
 * the character with the same code (String.fromCharCode).
 *
 * @param {Uint8Array} Uint8Arr - Bytes to convert.
 * @returns {string} String with one character per input byte.
 */
function Uint8ToLatin1Str(Uint8Arr){
    return Array.from(Uint8Arr, function (byteValue) {
        return String.fromCharCode(byteValue);
    }).join('');
}

// Every possible byte value, 0 through 255, in ascending order.
var latinSource = Uint8Array.from({ length: 256 }, function (_, byteValue) {
    return byteValue;
});

// Print the bytes converted by Uint8ToLatin1Str: a valid latin1 (iso-8859-1) string.
console.log(Uint8ToLatin1Str(latinSource));

  4. 最后,Windows-1252 的转换:

// The windows-1252 bytes in 0x80-0x9F that map to characters outside
// U+0000-U+00FF, as [byte, UTF-16 code unit] pairs. Bytes 0x81, 0x8D, 0x8F,
// 0x90 and 0x9D are absent: they fall through to the "same code" mapping.
const CP1252_SPECIAL_PAIRS = [
    [128, 0x20AC], // €
    [130, 0x201A], // ‚
    [131, 0x0192], // ƒ
    [132, 0x201E], // „
    [133, 0x2026], // …
    [134, 0x2020], // †
    [135, 0x2021], // ‡
    [136, 0x02C6], // ˆ
    [137, 0x2030], // ‰
    [138, 0x0160], // Š
    [139, 0x2039], // ‹
    [140, 0x0152], // Œ
    [142, 0x017D], // Ž
    [145, 0x2018], // ‘
    [146, 0x2019], // ’
    [147, 0x201C], // “
    [148, 0x201D], // ”
    [149, 0x2022], // •
    [150, 0x2013], // –
    [151, 0x2014], // —
    [152, 0x02DC], // ˜
    [153, 0x2122], // ™
    [154, 0x0161], // š
    [155, 0x203A], // ›
    [156, 0x0153], // œ
    [158, 0x017E], // ž
    [159, 0x0178], // Ÿ
];

// Encoding lookup: UTF-16 code unit -> windows-1252 byte (special characters only).
const CP1252_CODE_UNIT_TO_BYTE = new Map(
    CP1252_SPECIAL_PAIRS.map(function (pair) {
        return [pair[1], pair[0]];
    })
);

// Decoding lookup: byte value (0-255) -> one-character string. Built once so
// decoding is a plain array index instead of a table search per byte.
const CP1252_BYTE_TO_CHAR = Array.from({ length: 256 }, function (_, byteValue) {
    return String.fromCharCode(byteValue);
});
CP1252_SPECIAL_PAIRS.forEach(function (pair) {
    CP1252_BYTE_TO_CHAR[pair[0]] = String.fromCharCode(pair[1]);
});

// Byte written for characters that windows-1252 cannot represent ('?').
const CP1252_REPLACEMENT_BYTE = 0x3F;

/**
 * Converts between a windows-1252 (cp1252) string and its byte representation.
 * The direction is chosen by the argument type.
 *
 * Encoding (string -> bytes): one byte is produced per UTF-16 code unit.
 * Code units up to 0xFF are written as-is (so U+0080-U+009F pass through as
 * their own value), the 27 special characters are written as their
 * windows-1252 byte, and every other code unit is written as '?' (0x3F)
 * instead of silently turning into a NUL byte.
 *
 * Decoding (bytes -> string): one character is produced per byte.
 *
 * @param {string|Uint8Array} cp1252 - String to encode, or bytes to decode.
 * @returns {Uint8Array|string|undefined} Bytes for a string argument, a string
 *     for a Uint8Array argument, and undefined for any other argument type.
 */
function Windows1252EncodeDecode(
    cp1252  //string (to encode into bytes), or Uint8Array (to decode into string)
){
    if (typeof cp1252 === 'string') {
        const resultUint8 = new Uint8Array(cp1252.length);
        for (let i = 0; i < cp1252.length; i++) {
            const codeUnit = cp1252.charCodeAt(i);
            if (codeUnit <= 0xFF) {
                resultUint8[i] = codeUnit;
            } else if (CP1252_CODE_UNIT_TO_BYTE.has(codeUnit)) {
                resultUint8[i] = CP1252_CODE_UNIT_TO_BYTE.get(codeUnit);
            } else {
                resultUint8[i] = CP1252_REPLACEMENT_BYTE;
            }
        }
        return resultUint8; // Uint8Array
    }

    if (cp1252 instanceof Uint8Array) {
        const characters = new Array(cp1252.length);
        for (let i = 0; i < cp1252.length; i++) {
            characters[i] = CP1252_BYTE_TO_CHAR[cp1252[i]];
        }
        return characters.join(''); // string
    }

    // Unsupported argument type: nothing to convert.
    return undefined;
}

// Every possible byte value, 0 through 255, in ascending order.
var latinSource = Uint8Array.from({ length: 256 }, function (_, byteValue) {
    return byteValue;
});

// Decode with the built-in decoder; the author notes the output is a windows-1252 string.
var windows1252 = new TextDecoder("iso-8859-1").decode(latinSource);
console.log('new TextDecoder("iso-8859-1").decode(latinSource)', windows1252);

// Round trip: string -> bytes -> string, then compare with the TextDecoder output.
var bytesBack = Windows1252EncodeDecode(windows1252);
console.log('bytesBack', bytesBack.toString());

var Windows1252StringBack = Windows1252EncodeDecode(bytesBack);
console.log('string back', Windows1252StringBack);

var matchesTextDecoder = (Windows1252StringBack === windows1252);
console.log('Compare with TextDecoder', matchesTextDecoder);

  5. 对第 3 步中 latin-1 函数的修改:

/**
 * Reports whether every UTF-16 code unit of the string lies in U+0000-U+00FF,
 * i.e. whether the string can be stored one byte per character as iso-8859-1.
 *
 * @param {string} str - String to inspect.
 * @returns {boolean} true when no code unit above U+00FF is present.
 */
function isLatin1String(str){
    var firstOutsideLatin1 = str.search(/[^\u0000-\u00FF]/);
    return firstOutsideLatin1 === -1;
}

/**
 * Converts a string to bytes, choosing the byte layout from the content:
 *  - when isLatin1String(str) is true, one byte per character (latin1);
 *  - otherwise the UTF-8 bytes produced by TextEncoder.
 *
 * @param {string} str - String to convert.
 * @returns {Uint8Array} latin1 bytes or UTF-8 bytes, as described above.
 */
function StringToUint8Array(str){
    if(isLatin1String(str)){
        // ASCII-compatible latin1 string: each code unit is stored as one byte.
        return Uint8Array.from({ length: str.length }, function (_, position) {
            return str.charCodeAt(position);
        });
    }

    // Contains characters above U+00FF: encode to bytes as utf-8.
    // NOTE(review): the WHATWG TextEncoder constructor takes no arguments, so
    // the "utf-8" label is only informational here.
    return new TextEncoder("utf-8").encode(str);
}

/**
 * Builds a string from bytes, mapping each byte to the character with the
 * same code (String.fromCharCode), which yields a latin1-style string.
 *
 * @param {Uint8Array} Uint8Arr - Bytes to convert.
 * @returns {string} String with one character per input byte.
 */
function Uint8ToStr(Uint8Arr){
    var pieces = [];
    for (var index = 0; index < Uint8Arr.length; index += 1) {
        pieces.push(String.fromCharCode(Uint8Arr[index]));
    }
    return pieces.join('');
}

/**
 * Converts the string to bytes with StringToUint8Array and then decodes
 * those bytes as UTF-8.
 *
 * @param {string} latin1str - String whose characters carry the byte values.
 * @returns {string} Result of decoding the bytes with a "utf-8" TextDecoder.
 */
function latin1ToUtf8(latin1str){
  var rawBytes = StringToUint8Array(latin1str);
  var utf8Decoder = new TextDecoder("utf-8");
  return utf8Decoder.decode(rawBytes);
}

// Demo calls. Each label string repeats the expression whose result is logged.
console.log('StringToUint8Array("CittÃ")', StringToUint8Array("CittÃ")); // latin1 input
console.log('StringToUint8Array("Città€")', StringToUint8Array("Città€")); // contains '€', so utf-8 bytes
console.log('Uint8ToStr(StringToUint8Array("CittÃ"))', Uint8ToStr(StringToUint8Array("CittÃ"))); // latin1
// Label fixed: the argument actually passed is "Città€", not "Città".
console.log('Uint8ToStr(StringToUint8Array("Città€"))', Uint8ToStr(StringToUint8Array("Città€"))); // utf-8 bytes shown as latin1 characters
console.log('latin1ToUtf8(Uint8ToStr(StringToUint8Array("Città€")))', latin1ToUtf8(Uint8ToStr(StringToUint8Array("Città€")))); // utf-8