如何将字符串分成特定字节大小的块?

时间:2019-07-17 05:07:47

标签: javascript node.js

我正在与一个api进行交互,该api接受最大5KB的字符串。

我想把一个可能大于5KB的字符串拆分成若干个小于5KB的块。

然后,我打算将每个smaller-than-5kb-string传递给api端点,并在所有请求完成后执行进一步的操作,可能使用以下方法:

// Fire one request per chunk and wait until every request has settled successfully
await Promise.all(
  [string_1, string_2, string_3].map((piece) => get_thing_from_api(piece))
);

我了解到,字符串中的每个字符可能占用1到4个字节。

由于这个原因,我们可以使用以下方式计算字符串长度(以字节为单位):

// Node.js: size of the string once encoded as UTF-8 (the default encoding)
Buffer.byteLength("here is some text", "utf8");

// Browsers: a Blob stores string parts as UTF-8, so .size is the byte count
new Blob(["here is some text"]).size;

资料来源:
https://stackoverflow.com/a/56026151
https://stackoverflow.com/a/52254083

我对"how to split strings into chunks of a certain size"的搜索返回的结果与将字符串分成特定字符长度而不是字节长度的字符串有关,例如:

// Splits by character count (at most two characters per piece), not by byte size
const my_string = "1234 5 678905";
const pieces = my_string.match(/.{1,2}/g);

console.log(pieces);
// ["12", "34", " 5", " 6", "78", "90", "5"]

资料来源:
https://stackoverflow.com/a/7033662
https://stackoverflow.com/a/6259543
https://gist.github.com/hendriklammers/5231994

问题

是否可以将字符串拆分为特定字节长度的字符串?

我可以:

  • 假定字符串每个字符仅包含1个字节
  • 允许“最坏情况”中的每个字符为4个字节

不过,我更希望使用更准确的解决方案。

如果有Node和Plain JavaScript解决方案,我将很感兴趣。

编辑

这种计算byteLength的方法可能会有所帮助-通过遍历字符串中的字符,获取其字符代码并相应地递增byteLength

/**
 * Returns the number of bytes a string occupies once encoded as UTF-8.
 *
 * Unpaired surrogates are counted as 3 bytes each, which is the size of the
 * U+FFFD replacement character that UTF-8 encoders emit for them.
 *
 * @param {string} str - the string to measure
 * @returns {number} the UTF-8 byte length of str
 */
function byteLength(str) {
  // Start with one byte per UTF-16 code unit, then add the extra bytes needed.
  let s = str.length;
  for (let i = str.length - 1; i >= 0; i--) {
    const code = str.charCodeAt(i);
    if (code > 0x7f && code <= 0x7ff) s++;
    else if (code > 0x7ff && code <= 0xffff) s += 2;
    // A trail surrogate together with the lead surrogate before it encodes a
    // single 4-byte code point: the trail unit already accounted for 3 bytes and
    // the lead unit for 1, so the lead must not be examined again. Only skip
    // when the previous unit really is a lead surrogate; otherwise a lone trail
    // surrogate would swallow the extra bytes of an unrelated character.
    if (code >= 0xdc00 && code <= 0xdfff && i > 0) {
      const previous = str.charCodeAt(i - 1);
      if (previous >= 0xd800 && previous <= 0xdbff) i--;
    }
  }
  return s;
}

来源:https://stackoverflow.com/a/23329386

这促使我对Buffer的底层数据结构做了一些有趣的实验:

const buf = Buffer.from('Hey! ф');
// <Buffer 48 65 79 21 20 d1 84> - 'ф' is stored as the two bytes d1 84
buf.length; // 7
buf.toString().charCodeAt(0); // 72
buf.toString().charCodeAt(5); // 1092
buf.toString().charCodeAt(6); // NaN - the decoded string has only 6 characters
buf[0]; // 72
for (const byte of buf) {
  console.log(byte);
}
// logs 72, 101, 121, 33, 32, 209, 132 (one value per line)
buf.slice(0, 5).toString(); // 'Hey! '
buf.slice(0, 6).toString(); // 'Hey! �' - cut in the middle of the two-byte 'ф'
buf.slice(0, 7).toString(); // 'Hey! ф'

但是,正如@trincot在评论中指出的那样:处理多字节字符的正确方法是什么?又该如何确保各个块在空格处断开(以免把一个单词“拆开”)?

有关缓冲区的更多信息:https://nodejs.org/api/buffer.html#buffer_buffer

编辑

为了帮助其他人理解已接受答案中的精妙逻辑,下面的代码段是我写的一个带有大量注释的版本,我借此更好地理解了它。

/**
 * Takes a string and returns an array of substrings whose UTF-8 encoding is
 * at most maxBytes long, cutting only at space characters.
 *
 * This is a heavily commented (and heavily logged) variant of the
 * non-generator version of the accepted answer, meant for following the logic
 * step by step.
 *
 * Both plain js and node variations are shown below - simply un/comment out your preference
 *
 * @param  {string} s - the string to be chunked
 * @param  {number} maxBytes - the maximum size of a chunk, in bytes
 * @return {string[]} - chunks of at most maxBytes bytes; a chunk is only longer
 *                      when a single word (run without spaces) exceeds maxBytes
 */
function chunk(s, maxBytes) {
  // for plain js
  const decoder = new TextDecoder("utf-8");
  // TextEncoder always produces UTF-8 and takes no constructor argument
  let buf = new TextEncoder().encode(s);
  // for node
  // let buf = Buffer.from(s);
  const result = [];
  let counter = 0;
  while (buf.length) {
    console.log("=============== BEG LOOP " + counter + " ===============");
    console.log("result is now:");
    console.log(result);
    console.log("buf is now:");
    // for plain js
    console.log(decoder.decode(buf));
    // for node
    // console.log(buf.toString());
    let i;
    if (buf.length <= maxBytes) {
      /* everything that is left already fits into one chunk,
      so there is no reason to cut at a space at all */
      i = buf.length;
      console.log("remainder fits, i is: " + i);
    } else {
      /* get index of the last space character in the first chunk,
      searching backwards from the maxBytes index: a space at index
      maxBytes yields a chunk of exactly maxBytes bytes (0 .. maxBytes - 1);
      starting one position further would allow maxBytes + 1 bytes */
      i = buf.lastIndexOf(32, maxBytes);
      console.log("i is: " + i);
      /* if no space is found in the first chunk,
      get index of the first space character after it,
      searching forwards from maxBytes - in edge cases where the characters
      between spaces exceed maxBytes, eg chunk("123456789x 1", 9),
      the chunk will exceed maxBytes */
      if (i < 0) i = buf.indexOf(32, maxBytes);
      console.log("at first condition, i is: " + i);
      /* if there's no space at all, take the whole string,
      again an edge case like chunk("123456789x", 9) will exceed maxBytes */
      if (i < 0) i = buf.length;
      console.log("at second condition, i is: " + i);
    }
    // this is a safe cut-off point; never half-way a multi-byte character,
    // because the index is either the index of a space or the end of the data
    console.log("pushing buf.slice from 0 to " + i + " into result array");
    // for plain js (subarray is a view, so nothing is copied)
    result.push(decoder.decode(buf.subarray(0, i)));
    // for node
    // result.push(buf.subarray(0, i).toString());
    console.log("buf.slicing with value: " + (i + 1));
    // continue from the index + 1 forwards
    // it won't erroneously drop a value at i, because i is a space (or the end)
    buf = buf.subarray(i + 1); // skip space (if any)
    console.log("=============== END LOOP " + counter + " ===============");
    counter++;
  }
  return result;
}

console.log(chunk("Hey there! € 100 to pay", 12));
// -> [ 'Hey there!', '€ 100 to', 'pay' ]

1 个答案:

答案 0 :(得分:1)

使用Buffer似乎确实是正确的方向。鉴于:

  • Buffer原型具有indexOf和lastIndexOf方法,并且
  • 32是空格的ASCII代码,并且
  • 32永远不会作为多字节字符的一部分出现,因为构成多字节序列的所有字节的最高有效位始终为1。

...您可以按照以下步骤操作:

/**
 * Splits a string into pieces whose UTF-8 encoding is at most maxBytes long,
 * cutting only at space characters (Node version, based on Buffer).
 *
 * @param {string} s - the string to split
 * @param {number} maxBytes - the maximum size of a piece, in bytes
 * @returns {string[]} the pieces; a piece is only longer than maxBytes when a
 *   single word (run without spaces) does not fit
 */
function chunk(s, maxBytes) {
    let buf = Buffer.from(s);
    const result = [];
    while (buf.length) {
        let i;
        if (buf.length <= maxBytes) {
            // What is left already fits: do not split it at its last space
            i = buf.length;
        } else {
            // A space at index maxBytes yields a piece of exactly maxBytes
            // bytes; searching from maxBytes + 1 would allow one byte too many
            i = buf.lastIndexOf(32, maxBytes);
            // If no space found, try forward search
            if (i < 0) i = buf.indexOf(32, maxBytes);
            // If there's no space at all, take the whole string
            if (i < 0) i = buf.length;
        }
        // This is a safe cut-off point; never half-way a multi-byte
        result.push(buf.subarray(0, i).toString());
        buf = buf.subarray(i + 1); // Skip space (if any)
    }
    return result;
}

console.log(chunk("Hey there! € 100 to pay", 12));
// -> [ 'Hey there!', '€ 100 to', 'pay' ]

您可以考虑将其扩展为也将TAB,LF或CR视为拆分字符。如果是这样,并且您的输入文本可以包含CRLF序列,则还需要检测这些序列,以避免在块中出现孤立的CR或LF字符。

您可以将上述函数转换为生成器,以便控制何时开始获取下一个块的处理:

/**
 * Lazily splits a string into pieces whose UTF-8 encoding is at most maxBytes
 * long, cutting only at space characters (Node version, based on Buffer).
 * Being a generator, the next piece is only computed when the consumer asks
 * for it.
 *
 * @param {string} s - the string to split
 * @param {number} maxBytes - the maximum size of a piece, in bytes
 * @yields {string} the next piece; it is only longer than maxBytes when a
 *   single word (run without spaces) does not fit
 */
function * chunk(s, maxBytes) {
    let buf = Buffer.from(s);
    while (buf.length) {
        let i;
        if (buf.length <= maxBytes) {
            // What is left already fits: do not split it at its last space
            i = buf.length;
        } else {
            // A space at index maxBytes yields a piece of exactly maxBytes
            // bytes; searching from maxBytes + 1 would allow one byte too many
            i = buf.lastIndexOf(32, maxBytes);
            // If no space found, try forward search
            if (i < 0) i = buf.indexOf(32, maxBytes);
            // If there's no space at all, take all
            if (i < 0) i = buf.length;
        }
        // This is a safe cut-off point; never half-way a multi-byte
        yield buf.subarray(0, i).toString();
        buf = buf.subarray(i + 1); // Skip space (if any)
    }
}

for (const piece of chunk("Hey there! € 100 to pay", 12)) console.log(piece);

浏览器

Buffer是Node特有的。不过,浏览器实现了TextEncoder和TextDecoder,因此可以写出类似的代码:

/**
 * Lazily splits a string into pieces whose UTF-8 encoding is at most maxBytes
 * long, cutting only at space characters (browser version, based on
 * TextEncoder / TextDecoder).
 *
 * @param {string} s - the string to split
 * @param {number} maxBytes - the maximum size of a piece, in bytes
 * @yields {string} the next piece; it is only longer than maxBytes when a
 *   single word (run without spaces) does not fit
 */
function * chunk(s, maxBytes) {
    const decoder = new TextDecoder("utf-8");
    // TextEncoder always produces UTF-8 and takes no constructor argument
    let buf = new TextEncoder().encode(s);
    while (buf.length) {
        let i;
        if (buf.length <= maxBytes) {
            // What is left already fits: do not split it at its last space
            i = buf.length;
        } else {
            // A space at index maxBytes yields a piece of exactly maxBytes
            // bytes; searching from maxBytes + 1 would allow one byte too many
            i = buf.lastIndexOf(32, maxBytes);
            // If no space found, try forward search
            if (i < 0) i = buf.indexOf(32, maxBytes);
            // If there's no space at all, take all
            if (i < 0) i = buf.length;
        }
        // This is a safe cut-off point; never half-way a multi-byte.
        // subarray creates views, so the remaining bytes are never copied.
        yield decoder.decode(buf.subarray(0, i));
        buf = buf.subarray(i + 1); // Skip space (if any)
    }
}

for (const piece of chunk("Hey there! € 100 to pay", 12)) console.log(piece);