我正在与一个api进行交互,该api接受最大5KB的字符串。
我想要一个可能大于5KB的字符串,并将其分成小于5KB的块。
然后,我打算将每个smaller-than-5kb-string
传递给api端点,并在所有请求完成后执行进一步的操作,可能使用以下方法:
await Promise.all([get_thing_from_api(string_1), get_thing_from_api(string_2), get_thing_from_api(string_3)])
我已经读过一个字符串中的字符可以在1-4个字节之间。
由于这个原因,我们可以使用以下方式计算字符串长度(以字节为单位):
// In Node: Buffer.byteLength counts the bytes of the string's UTF-8 encoding
// (not its number of characters).
Buffer.byteLength("here is some text");
// In browser JavaScript: wrap the string in a Blob and read its size in bytes.
new Blob(["here is some text"]).size
资料来源:
https://stackoverflow.com/a/56026151
https://stackoverflow.com/a/52254083
我对"how to split strings into chunks of a certain size"
的搜索返回的结果与将字符串分成特定字符长度而不是字节长度的字符串有关,例如:
var my_string = "1234 5 678905";
// The pattern matches up to two characters at a time, so this splits by
// character count, not by byte length.
console.log(my_string.match(/.{1,2}/g));
// ["12", "34", " 5", " 6", "78", "90", "5"]
资料来源:
https://stackoverflow.com/a/7033662
https://stackoverflow.com/a/6259543
https://gist.github.com/hendriklammers/5231994
问题
是否可以将字符串拆分为特定字节长度的字符串?
我可以按最坏情况估算(例如假设每个字符都占4个字节)来切分,但希望使用更准确的解决方案。
如果有Node和Plain JavaScript解决方案,我将很感兴趣。
编辑
下面这种计算byteLength的方法可能会有所帮助:遍历字符串中的字符,获取其字符代码,并相应地递增byteLength:
/**
 * Returns the number of bytes `str` occupies when encoded as UTF-8.
 *
 * The count starts at one byte per UTF-16 code unit and is then topped up
 * for every code unit that needs more than one byte. Unpaired surrogates are
 * counted as 3 bytes each, i.e. the size of the U+FFFD replacement character
 * that UTF-8 encoders emit for them.
 *
 * @param {string} str - the string to measure
 * @returns {number} the UTF-8 byte length of `str`
 */
function byteLength(str) {
  let bytes = str.length;
  for (let i = str.length - 1; i >= 0; i--) {
    const code = str.charCodeAt(i);
    if (code > 0x7f && code <= 0x7ff) {
      bytes += 1; // 2-byte sequence
    } else if (code > 0x7ff && code <= 0xffff) {
      bytes += 2; // 3-byte sequence, or the trailing half of a surrogate pair
    }
    if (code >= 0xdc00 && code <= 0xdfff && i > 0) {
      // A trail surrogate completes a 4-byte character only when a lead
      // surrogate precedes it. Skip that lead unit (2 units + 2 = 4 bytes).
      // Anything else before a lone trail surrogate must still be counted.
      const previous = str.charCodeAt(i - 1);
      if (previous >= 0xd800 && previous <= 0xdbff) i--;
    }
  }
  return bytes;
}
来源:https://stackoverflow.com/a/23329386
这促使我对Buffer的底层数据结构做了一些有趣的实验:
// REPL-style experiment: how a string with one multi-byte character is laid out in a Buffer.
var buf = Buffer.from('Hey! ф');
// <Buffer 48 65 79 21 20 d1 84>
// Five single-byte characters followed by the two bytes (d1 84) of 'ф'.
buf.length // 7  (bytes, not characters)
buf.toString().charCodeAt(0) // 72
buf.toString().charCodeAt(5) // 1092  ('ф' is the sixth character of the decoded string)
buf.toString().charCodeAt(6) // NaN  (the decoded string has only 6 characters)
buf[0] // 72  (indexing the buffer yields a byte value)
for (let i = 0; i < buf.length; i++) {
console.log(buf[i]);
}
// 72 101 121 33 32 209 132 undefined
// (the trailing "undefined" is presumably the REPL echoing the loop's result, not a logged byte)
buf.slice(0,5).toString() // 'Hey! '
buf.slice(0,6).toString() // 'Hey! �'  (cut between the two bytes of 'ф')
buf.slice(0,7).toString() // 'Hey! ф'
但是,正如@trincot在评论中指出的那样:处理多字节字符的正确方法是什么?又该如何确保各个块在空格处断开(以免把一个单词“拆开”)?
有关缓冲区的更多信息:https://nodejs.org/api/buffer.html#buffer_buffer
编辑
如果它可以帮助其他人理解已接受的答案中的精妙逻辑,以下代码段是我为了更好地理解它而撰写的、带有大量注释的版本。
/**
 * Splits a string into substrings whose UTF-8 encoding is at most maxBytes bytes,
 * cutting only at space characters so that neither words nor multi-byte
 * characters are broken apart.
 *
 * This is a heavily commented, trace-logging version of the non-generator
 * variant of the accepted answer, in case it helps anyone follow its logic.
 *
 * Both plain js and node variations are shown below - simply un/comment out your preference
 *
 * @param {string} s - the string to be chunked
 * @param {number} maxBytes - the maximum size of a chunk, in bytes
 * @return {string[]} the chunks, each at most maxBytes bytes long, except when a
 *   single run of non-space characters is itself longer than maxBytes: such a
 *   run is kept whole and its chunk exceeds the limit
 */
function chunk(s, maxBytes) {
  // for plain js
  const decoder = new TextDecoder("utf-8");
  let buf = new TextEncoder().encode(s);
  // for node
  // let buf = Buffer.from(s);
  const result = [];
  let counter = 0;
  while (buf.length) {
    console.log("=============== BEG LOOP " + counter + " ===============");
    console.log("result is now:");
    console.log(result);
    console.log("buf is now:");
    // for plain js
    console.log(decoder.decode(buf));
    // for node
    // console.log(buf.toString());
    /* if everything that is left already fits, take it all; otherwise get the
       index of the last space at or before index maxBytes - cutting at index i
       yields exactly i bytes, so i must not be larger than maxBytes */
    let i = buf.length <= maxBytes ? buf.length : buf.lastIndexOf(32, maxBytes);
    console.log("i is: " + i);
    /* if no space is found in the first chunk,
       get index of the first space character after it,
       searching forwards from maxBytes - in edge cases where the characters
       between spaces exceed maxBytes, eg chunk("123456789x 1", 9),
       the chunk will exceed maxBytes */
    if (i < 0) i = buf.indexOf(32, maxBytes);
    console.log("at first condition, i is: " + i);
    /* if there's no space at all, take the whole string,
       again an edge case like chunk("123456789x", 9) will exceed maxBytes */
    if (i < 0) i = buf.length;
    console.log("at second condition, i is: " + i);
    // this is a safe cut-off point; never half-way a multi-byte character,
    // because the index is either the index of a space or the end of the data
    console.log("pushing buf.slice from 0 to " + i + " into result array");
    // for plain js (subarray creates a view instead of copying the bytes)
    result.push(decoder.decode(buf.subarray(0, i)));
    // for node
    // result.push(buf.subarray(0, i).toString());
    console.log("buf.slicing with value: " + (i + 1));
    // continue from the index + 1 forwards
    // it won't erroneously drop a value after i, because i is a space (or the end)
    buf = buf.subarray(i + 1); // skip space (if any)
    console.log("=============== END LOOP " + counter + " ===============");
    counter++;
  }
  return result;
}
console.log(chunk("Hey there! € 100 to pay", 12));
答案 0 :(得分:1)
使用Buffer似乎确实是正确的方向。鉴于:
Buffer原型具有indexOf和lastIndexOf方法,并且32是空格的ASCII码,而在UTF-8中多字节字符的每个字节都设置了最高位,因此32绝不会出现在多字节字符内部……您可以按照以下步骤操作:
/**
 * Splits a string into substrings whose UTF-8 encoding is at most maxBytes
 * bytes, cutting only at spaces (byte 32). Node version, based on Buffer.
 *
 * @param {string} s - the string to split
 * @param {number} maxBytes - maximum chunk size in bytes
 * @returns {string[]} the chunks; a run of non-space characters longer than
 *   maxBytes is kept whole, so only such a chunk can exceed the limit
 */
function chunk(s, maxBytes) {
  let buf = Buffer.from(s);
  const result = [];
  while (buf.length) {
    // If the remainder already fits, take all of it. Otherwise look for the
    // last space at or before index maxBytes: cutting at index i yields
    // exactly i bytes, so i may not be larger than maxBytes.
    let i = buf.length <= maxBytes ? buf.length : buf.lastIndexOf(32, maxBytes);
    // If no space found, try forward search
    if (i < 0) i = buf.indexOf(32, maxBytes);
    // If there's no space at all, take the whole string
    if (i < 0) i = buf.length;
    // This is a safe cut-off point; never half-way a multi-byte character.
    // subarray returns a view on the same memory, so nothing is copied.
    result.push(buf.subarray(0, i).toString());
    buf = buf.subarray(i + 1); // Skip space (if any)
  }
  return result;
}
console.log(chunk("Hey there! € 100 to pay", 12));
// -> [ 'Hey there!', '€ 100 to', 'pay' ]
您可以考虑将其扩展为也将TAB,LF或CR视为拆分字符。如果是这样,并且您的输入文本可以包含CRLF序列,则还需要检测这些序列,以避免在块中出现孤立的CR或LF字符。
您可以将上述函数转换为生成器,以便控制何时开始获取下一个块的处理:
/**
 * Generator variant: lazily yields substrings of `s` whose UTF-8 encoding is
 * at most maxBytes bytes, cutting only at spaces (byte 32). Node version,
 * based on Buffer.
 *
 * @param {string} s - the string to split
 * @param {number} maxBytes - maximum chunk size in bytes
 * @yields {string} the next chunk; a run of non-space characters longer than
 *   maxBytes is kept whole, so only such a chunk can exceed the limit
 */
function* chunk(s, maxBytes) {
  let buf = Buffer.from(s);
  while (buf.length) {
    // If the remainder already fits, take all of it. Otherwise look for the
    // last space at or before index maxBytes: cutting at index i yields
    // exactly i bytes, so i may not be larger than maxBytes.
    let i = buf.length <= maxBytes ? buf.length : buf.lastIndexOf(32, maxBytes);
    // If no space found, try forward search
    if (i < 0) i = buf.indexOf(32, maxBytes);
    // If there's no space at all, take all
    if (i < 0) i = buf.length;
    // This is a safe cut-off point; never half-way a multi-byte character.
    // subarray returns a view on the same memory, so nothing is copied.
    yield buf.subarray(0, i).toString();
    buf = buf.subarray(i + 1); // Skip space (if any)
  }
}
for (const s of chunk("Hey there! € 100 to pay", 12)) console.log(s);
Buffer特定于Node。但是,浏览器实现了TextEncoder和TextDecoder,这导致了类似的代码:
/**
 * Generator variant for browsers (and any runtime with TextEncoder and
 * TextDecoder): lazily yields substrings of `s` whose UTF-8 encoding is at
 * most maxBytes bytes, cutting only at spaces (byte 32).
 *
 * @param {string} s - the string to split
 * @param {number} maxBytes - maximum chunk size in bytes
 * @yields {string} the next chunk; a run of non-space characters longer than
 *   maxBytes is kept whole, so only such a chunk can exceed the limit
 */
function* chunk(s, maxBytes) {
  const decoder = new TextDecoder("utf-8");
  let buf = new TextEncoder().encode(s);
  while (buf.length) {
    // If the remainder already fits, take all of it. Otherwise look for the
    // last space at or before index maxBytes: cutting at index i yields
    // exactly i bytes, so i may not be larger than maxBytes.
    let i = buf.length <= maxBytes ? buf.length : buf.lastIndexOf(32, maxBytes);
    // If no space found, try forward search
    if (i < 0) i = buf.indexOf(32, maxBytes);
    // If there's no space at all, take all
    if (i < 0) i = buf.length;
    // This is a safe cut-off point; never half-way a multi-byte character.
    // subarray creates a view; slice would copy the whole remainder each time.
    yield decoder.decode(buf.subarray(0, i));
    buf = buf.subarray(i + 1); // Skip space (if any)
  }
}
for (const s of chunk("Hey there! € 100 to pay", 12)) console.log(s);