我有一个字符串,以及用于提取子字符串的 start 和 length。这两个值(起始位置和长度)都是基于原始 UTF-8 字符串的字节偏移量。
然而,有一个问题:
开头和长度以字节为单位,所以我不能使用“substring”。 UTF8字符串包含多个多字节字符。这样做是否有超高效的方法? (我不需要解码字节......)
示例: var orig = '你好吗?'
start 和 length 可以是 3 和 3,用来提取第二个字符(好)。我想要的是类似下面这样的调用:
var result = orig.substringBytes(3,3);
帮助!
更新#1 在C / C ++中,我只是将其转换为字节数组,但不确定javascript中是否存在等价物。顺便说一句,是的,我们可以将它解析成一个字节数组并将其解析回一个字符串,但似乎应该有一个快速的方法在正确的地方剪切它。想象一下'orig'是1000000个字符,s = 6个字节,l = 3个字节。
更新#2 多亏了 zerkms 提供的有用指引,我最终写出了下面的代码,但它不能正常工作——对多字节字符有效,却会把单字节字符算错。
/**
 * Attempt at extracting the part of `str` between the byte offsets `start`
 * and `start + length`, estimating each character's byte width by shifting
 * its UTF-16 code unit right 8 bits at a time.
 *
 * NOTE(review): this version is known not to work. Every character is counted
 * as at least 2 bytes (see the comments inside the loop), so single-byte
 * characters are mis-measured.
 *
 * @param {string} str    string to cut
 * @param {number} start  intended start offset, in bytes
 * @param {number} length intended length, in bytes
 * @returns {string|undefined} the collected characters; `undefined` when
 *          `str` is empty, because the loop body never runs and there is no
 *          return statement after the loop
 */
function substrBytes(str, start, length)
{
var ch, startIx = 0, endIx = 0, re = '';
// NOTE(review): the condition `0 < str.length` does not involve `i`, so for a
// non-empty string it is always true; the loop only ends through the
// `return re` below.
for (var i = 0; 0 < str.length; i++)
{
// startIx receives the previous end offset, and endIx is already bumped by
// one here, before the character has been measured.
startIx = endIx++;
ch = str.charCodeAt(i);
// The do/while body always runs at least once, so together with the `endIx++`
// above a code unit below 0x100 advances endIx by 2 and one at or above 0x100
// advances it by 3.
do {
ch = ch >> 8; // a better way may exist to measure ch len
endIx++;
}
while (ch);
// Stop as soon as the running end offset passes the requested range.
if (endIx > start + length)
{
return re;
}
// Otherwise keep the character once its start offset has reached `start`.
else if (startIx >= start)
{
re += str[i];
}
}
}
更新#3 我认为对字符编码值做移位其实行不通。正确答案是三个字节时,我读到的却是两个字节……不知为何我总是忘记这一点:UTF-8 和 UTF-16 的码位是相同的,但编码后占用的字节数取决于所用的编码!所以这不是正确的做法。
答案 0 :(得分:8)
我玩得很开心。希望这会有所帮助。
由于Javascript不允许对字符串进行直接字节访问,因此查找起始位置的唯一方法是正向扫描。
(引用提问者的话)更新#3 我认为对字符编码值做移位其实行不通。正确答案是三个字节时,我读到的却是两个字节……不知为何我总是忘记这一点:UTF-8 和 UTF-16 的码位是相同的,但编码后占用的字节数取决于所用的编码!所以这不是正确的做法。
这是不正确的 - 实际上javascript中没有UTF-8字符串。根据ECMAScript 262规范,所有字符串 - 无论输入编码如何 - 必须在内部存储为UTF-16(“[序列] 16位无符号整数”)。
考虑到这一点,8位移位是正确的(但不必要)。
错误的假设在于:你以为字符是以 3 字节序列的形式存储的……
实际上,JS(ECMA-262)字符串中的所有字符都是16位(2字节)长。
这可以通过手动将多字节字符转换为utf-8来解决,如下面的代码所示。
请参阅我的示例代码中解释的详细信息:
/**
 * Re-encodes a JavaScript string as a "binary string" in which every
 * character stands for one byte of the input's UTF-8 representation.
 *
 * encodeURIComponent() percent-escapes the UTF-8 bytes of every non-ASCII
 * character, and unescape() then turns each %XX escape back into a single
 * character with that code. The length of the result is therefore the
 * UTF-8 byte length of the input.
 *
 * @param {string} s text to convert
 * @returns {string} one character (code 0-255) per UTF-8 byte of `s`
 */
function encode_utf8( s )
{
    var percentEncoded = encodeURIComponent( s );
    return unescape( percentEncoded );
}
/**
 * Returns the part of `str` selected by a start offset and a length that are
 * both expressed in bytes of the string's UTF-8 encoding.
 *
 * JavaScript strings are sequences of 16-bit UTF-16 code units and offer no
 * byte-level access, so the string is scanned forward and the UTF-8 width of
 * every character is derived from its code:
 *   "a"  (below U+0080)         -> 1 byte
 *   "ü"  (below U+0800)         -> 2 bytes
 *   "你" (rest of the BMP)      -> 3 bytes
 *   a surrogate pair (> U+FFFF) -> 4 bytes, consumed as one character
 *
 * Only whole characters are returned. If `startInBytes` falls inside a
 * multi-byte character, extraction begins with the next character and
 * `lengthInBytes` is counted from there. A character is included when it
 * begins before the byte budget is used up, even if its trailing bytes extend
 * past it. Ranges that reach beyond the end of the string are clamped to the
 * end of the string.
 *
 * @param {string} str            source string
 * @param {number} startInBytes   start offset, in UTF-8 bytes
 * @param {number} lengthInBytes  number of UTF-8 bytes to extract
 * @returns {string} the selected substring ('' when nothing is selected)
 */
function substr_utf8_bytes(str, startInBytes, lengthInBytes) {
    var totalUnits = str.length;

    // UTF-8 width (1-4 bytes) of the character starting at UTF-16 index `ix`.
    function utf8WidthAt(ix) {
        var code = str.charCodeAt(ix);
        var next;
        if (code < 0x80) {
            return 1;
        }
        if (code < 0x800) {
            return 2;
        }
        if (code >= 0xD800 && code <= 0xDBFF && ix + 1 < totalUnits) {
            next = str.charCodeAt(ix + 1);
            if (next >= 0xDC00 && next <= 0xDFFF) {
                // high + low surrogate: one code point above U+FFFF
                return 4;
            }
        }
        // remaining BMP characters; an unpaired surrogate is also counted as
        // 3 bytes, the size of the U+FFFD replacement encoders emit for it
        return 3;
    }

    var charIx = 0;
    var bytePos = 0;
    var width;

    // Convert the start position from bytes to a UTF-16 index, never running
    // past the end of the string.
    while (bytePos < startInBytes && charIx < totalUnits) {
        width = utf8WidthAt(charIx);
        bytePos += width;
        charIx += (width === 4) ? 2 : 1;
    }

    // Consume whole characters until the byte budget is used up.
    var firstIx = charIx;
    var bytesLeft = lengthInBytes;
    while (bytesLeft > 0 && charIx < totalUnits) {
        width = utf8WidthAt(charIx);
        bytesLeft -= width;
        charIx += (width === 4) ? 2 : 1;
    }

    return str.slice(firstIx, charIx);
}
// Demo: each pair is [start offset in bytes, length in bytes]; the comment
// shows the text the corresponding alert is expected to display.
var orig = 'abc你好吗?';
[
    [0, 2], // alerts: "ab"
    [2, 1], // alerts: "c"
    [3, 3], // alerts: "你"
    [6, 6]  // alerts: "好吗"
].forEach(function (range) {
    alert('res: ' + substr_utf8_bytes(orig, range[0], range[1]));
});
答案 1 :(得分:6)
@Kaii 的答案几乎是正确的,但有一个错误:它无法处理 Unicode 码位在 128 到 255 之间的字符。下面是修订版(只需把 256 改成 128):
/**
 * Produces the UTF-8 form of `s` as a "binary string": each character of the
 * result has a code between 0 and 255 and represents one UTF-8 byte.
 *
 * The conversion goes through percent-encoding: encodeURIComponent() writes
 * the UTF-8 bytes of non-ASCII characters as %XX escapes, and unescape()
 * maps every escape back to a single character.
 *
 * @param {string} s text to convert
 * @returns {string} binary string whose length is the UTF-8 byte length of `s`
 */
function encode_utf8( s )
{
    var escaped = encodeURIComponent( s );
    var byteString = unescape( escaped );
    return byteString;
}
/**
 * Returns the part of `str` selected by a start offset and a length that are
 * both expressed in bytes of the string's UTF-8 encoding.
 *
 * JavaScript strings are sequences of 16-bit UTF-16 code units and offer no
 * byte-level access, so the string is scanned forward and the UTF-8 width of
 * every character is derived from its code:
 *   "a"  (below U+0080)         -> 1 byte
 *   "ü"  (below U+0800)         -> 2 bytes
 *   "你" (rest of the BMP)      -> 3 bytes
 *   a surrogate pair (> U+FFFF) -> 4 bytes, consumed as one character
 *
 * Only whole characters are returned. If `startInBytes` falls inside a
 * multi-byte character, extraction begins with the next character and
 * `lengthInBytes` is counted from there. A character is included when it
 * begins before the byte budget is used up, even if its trailing bytes extend
 * past it. Ranges that reach beyond the end of the string are clamped to the
 * end of the string.
 *
 * @param {string} str            source string
 * @param {number} startInBytes   start offset, in UTF-8 bytes
 * @param {number} lengthInBytes  number of UTF-8 bytes to extract
 * @returns {string} the selected substring ('' when nothing is selected)
 */
function substr_utf8_bytes(str, startInBytes, lengthInBytes) {
    var totalUnits = str.length;

    // UTF-8 width (1-4 bytes) of the character starting at UTF-16 index `ix`.
    function utf8WidthAt(ix) {
        var code = str.charCodeAt(ix);
        var next;
        if (code < 0x80) {
            return 1;
        }
        if (code < 0x800) {
            // includes U+0080..U+00FF (e.g. "©"), which need 2 bytes
            return 2;
        }
        if (code >= 0xD800 && code <= 0xDBFF && ix + 1 < totalUnits) {
            next = str.charCodeAt(ix + 1);
            if (next >= 0xDC00 && next <= 0xDFFF) {
                // high + low surrogate: one code point above U+FFFF
                return 4;
            }
        }
        // remaining BMP characters; an unpaired surrogate is also counted as
        // 3 bytes, the size of the U+FFFD replacement encoders emit for it
        return 3;
    }

    var charIx = 0;
    var bytePos = 0;
    var width;

    // Convert the start position from bytes to a UTF-16 index, never running
    // past the end of the string.
    while (bytePos < startInBytes && charIx < totalUnits) {
        width = utf8WidthAt(charIx);
        bytePos += width;
        charIx += (width === 4) ? 2 : 1;
    }

    // Consume whole characters until the byte budget is used up.
    var firstIx = charIx;
    var bytesLeft = lengthInBytes;
    while (bytesLeft > 0 && charIx < totalUnits) {
        width = utf8WidthAt(charIx);
        bytesLeft -= width;
        charIx += (width === 4) ? 2 : 1;
    }

    return str.slice(firstIx, charIx);
}
// Demo of byte-based extraction. UTF-8 layout of the sample string:
// "abc" = bytes 0-2, "你好吗" = bytes 3-11, "?" = byte 12, "©" = bytes 13-14.
var orig = 'abc你好吗?©';
alert('res: ' + substr_utf8_bytes(orig, 0, 2)); // alerts: "ab"
alert('res: ' + substr_utf8_bytes(orig, 2, 1)); // alerts: "c"
alert('res: ' + substr_utf8_bytes(orig, 3, 3)); // alerts: "你"
alert('res: ' + substr_utf8_bytes(orig, 6, 6)); // alerts: "好吗"
// "©" starts at byte 13 (the "?" before it is a single byte), so offset 13,
// not 15, selects it.
alert('res: ' + substr_utf8_bytes(orig, 13, 2)); // alerts: "©"
顺便说一句,这是一个错误修复,它应该对那些有同样问题的人有用。由于“太多”或“太小”的变化,审稿人为什么拒绝我的编辑建议? @Adam Eberlin @Kjuly @Jasonw
答案 2 :(得分:2)
/**
 * Node.js variant: encodes `str` as UTF-8 and returns the text found in the
 * byte range [start, start + length).
 *
 * If the range cuts through a multi-byte character, the incomplete bytes
 * decode to U+FFFD replacement characters.
 *
 * @param {string} str     source string
 * @param {number} start   start offset, in UTF-8 bytes
 * @param {number} length  number of UTF-8 bytes to extract
 * @returns {string} the decoded byte range
 * @throws {TypeError} if `str` is a number (the deprecated `new Buffer(n)`
 *         would instead have allocated an n-byte buffer)
 */
function substrBytes(str, start, length)
{
    // Buffer.from() replaces the deprecated `new Buffer(string)` constructor.
    var buf = Buffer.from(str, 'utf8');
    // subarray() is the non-deprecated equivalent of Buffer#slice().
    return buf.subarray(start, start + length).toString('utf8');
}
AYB
答案 3 :(得分:0)
System.ArraySegment很有用,但你需要带有数组输入和偏移量和索引器的构造函数。
答案 4 :(得分:0)
也许可以用下面的函数来计算字节数,后面附有用法示例。它把你的字符算作 2 个字节,而不是像 @Kaii 的函数那样算作 3 个字节:
/**
 * Counts the "byte length" of a value using a half-width / full-width rule:
 * every UTF-16 code unit in one of the half-width ranges below counts as 1,
 * every other code unit counts as 2. This is not the UTF-8 length (a
 * character such as "你" counts as 2 here).
 *
 * Half-width ranges: 0x0 - 0x80, 0xf8f0, 0xf8f1 - 0xf8f3, 0xff61 - 0xff9f.
 *
 * @param {*} target value to measure; it is first passed through
 *        jQuery.castString (presumably a string coercion - helper not shown)
 * @returns {number} the computed count, or 0 when an exception was thrown
 *          (the exception is reported through jQuery.showErrorDetail)
 */
jQuery.byteLength = function(target) {
    var text;
    var limit;
    var pos;
    var code;
    var isHalfWidth;
    var total = 0;

    try {
        text = jQuery.castString(target);
        limit = text.length;

        for (pos = 0; pos < limit; pos++) {
            // take one character and get its Unicode (UTF-16) code
            code = text.charCodeAt(pos);

            isHalfWidth =
                (code >= 0x0 && code < 0x81) ||
                (code == 0xf8f0) ||
                (code > 0xff60 && code < 0xffa0) ||
                (code > 0xf8f0 && code < 0xf8f4);

            // half-width: 1-byte character, anything else: 2-byte character
            total += isHalfWidth ? 1 : 2;
        }

        return total;
    } catch (e) {
        jQuery.showErrorDetail(e, 'byteLength');
        return 0;
    }
};
// Usage example: look for the prefix of `value` whose $.byteLength is 106
// and write that prefix back with $(this).val(...).
// NOTE(review): `value` and `this` come from an enclosing scope that is not
// part of this snippet - presumably an input element's event handler; confirm.
for (var j = 1, len = value.length; j <= len; j++) {
// Prefix consisting of the first j characters of `value`.
var slice = value.slice(0, j);
var slength = $.byteLength(slice);
// NOTE(review): byteLength grows by 1 or 2 per character, so the count can
// step from 105 straight to 107 and never equal 106; in that case the loop
// finishes without truncating anything. Verify whether that is intended.
if ( slength == 106 ) {
$(this).val(slice);
break;
}
}