我编写了一个基于空格分割单词并保留标点符号的函数。
/**
 * Splits an utterance into word and punctuation tokens and logs the result.
 *
 * A word is a run of word characters, hyphens and apostrophes; any other run
 * of non-whitespace characters becomes its own token. Whitespace is dropped.
 *
 * @param {*} utterance - Value to tokenize; converted with `toString()`.
 * @returns {undefined} Nothing is returned; the token array (or `null` when
 *   nothing matches) is written to the console.
 */
function tokenizeUtterance(utterance) {
  const tokenPattern = /[\w-']+|[^\w\s]+/g;
  const text = utterance.toString();
  const tokens = text.match(tokenPattern);
  console.log(tokens);
}
假设我有一串文字,如:“HELLO, WORLD”
我想以下列格式提取JSON对象中分割词的相对开始和结束位置。
+-------+-------+-----+
| word | start | end |
+-------+-------+-----+
| HELLO | 0 | 4 |
| , | 5 | 5 |
| WORLD | 7 | 11 |
+-------+-------+-----+
答案 0 :(得分:1)
一种相对简单的方法是:假设输出的各个标记保持它们在输入字符串中的顺序,只需依次累加每个标记的字符数,即可得到其起止位置(注意:这种做法假设标记之间没有被跳过的空白字符):
/**
 * Splits an utterance into word tokens and punctuation tokens.
 *
 * A word is a run of word characters, hyphens and apostrophes; any other run
 * of non-whitespace characters becomes its own token. Whitespace is dropped.
 *
 * @param {*} utterance - Value to tokenize; converted with `toString()`.
 * @returns {string[]} Tokens in input order, or an empty array when the
 *   utterance contains no tokens.
 */
function tokenizeUtterance(utterance) {
  // With the `g` flag, `match` yields null when nothing matches; normalize
  // to an empty array so callers can iterate the result unconditionally.
  return utterance.toString().match(/[\w-']+|[^\w\s]+/g) || [];
}
/**
 * Computes the inclusive start and end offsets of each token.
 *
 * @param {string[]|null|undefined} tokenizedUtterance - Tokens in the order
 *   they appear in the text. A null/undefined value is treated as no tokens.
 * @param {string} [utterance] - Optional original text. When supplied, each
 *   token is located in it (searching forward from the end of the previous
 *   token), so whitespace dropped by the tokenizer is reflected in the
 *   offsets. When omitted, tokens are assumed to be contiguous.
 * @returns {{word: string, start: number, end: number}[]} One entry per
 *   token, in input order; `end` is the index of the token's last character.
 */
function getStartAndEnd(tokenizedUtterance, utterance) {
  const hasSourceText = typeof utterance === 'string';
  const result = [];
  let cursor = 0;
  for (const word of tokenizedUtterance || []) {
    let start = cursor;
    if (hasSourceText) {
      const found = utterance.indexOf(word, cursor);
      // Fall back to the contiguous position if the token is not present
      // in the supplied text, rather than emitting a negative offset.
      if (found !== -1) {
        start = found;
      }
    }
    result.push({ word, start, end: start + word.length - 1 });
    cursor = start + word.length;
  }
  return result;
}
这是你得到的:
[
{"word":"HELLO","start":0,"end":4},
{"word":",","start":5,"end":5},
{"word":"WORLD","start":6,"end":10}
]
答案 1 :(得分:1)
以下是使用 `indexOf` 为每个单词执行此操作的方法:
/**
 * Tokenizes an utterance and reports where each token sits in the text.
 *
 * A word is a run of word characters, hyphens and apostrophes; any other run
 * of non-whitespace characters becomes its own token. Whitespace is dropped.
 *
 * @param {*} utterance - Value to tokenize; converted with `toString()`.
 * @returns {{word: string, start: number, end: number}[]} One entry per
 *   token, in input order; `start` and `end` are inclusive character indexes.
 *   An utterance without tokens yields an empty array.
 */
function getResult(utterance) {
  // Tokenize and search the same stringified text so non-string inputs
  // (e.g. numbers) behave consistently.
  const text = utterance.toString();
  // `match` returns null when nothing matches; treat that as "no tokens".
  const tokens = text.match(/[\w-']+|[^\w\s]+/g) || [];
  const result = [];
  let searchFrom = 0;
  for (const word of tokens) {
    const start = text.indexOf(word, searchFrom);
    const end = start + word.length - 1;
    // Resume searching after this token; otherwise a repeated token would be
    // found again at the position of its previous occurrence.
    searchFrom = end + 1;
    result.push({ word, start, end });
  }
  return result;
}
// Demo: print the token positions for a sample sentence as a JSON string.
const demoPositions = getResult('Hello, world Hello');
console.log(JSON.stringify(demoPositions));
输出结果为:
[
{"word":"Hello","start":0,"end":4},
{"word":",","start":5,"end":5},
{"word":"world","start":7,"end":11},
{"word":"Hello","start":13,"end":17}
]
答案 2 :(得分:1)
这里你只需要:
1. 使用 String#replace() 方法将 string 拆分为由 word 和 index 对组成的数组。
2. 使用 Array#map() 方法返回每个 word 的自定义数据。
你的代码应该是这样的:
/**
 * Tokenizes an utterance and reports the position of every token.
 *
 * A word is a run of word characters, hyphens and apostrophes; any other run
 * of non-whitespace characters becomes its own token. Whitespace is dropped.
 *
 * @param {*} utterance - Value to tokenize; converted with `toString()`.
 * @returns {{word: string, start: number, end: number}[]} One entry per
 *   token, in input order; `start` and `end` are inclusive character indexes.
 */
function tokenizeUtterance(utterance) {
  const tokenPattern = /[\w-']+|[^\w\s]+/g;
  const matches = [];
  // String#replace is used only for its callback, which receives every match
  // together with its offset; the replaced string itself is discarded.
  utterance.toString().replace(tokenPattern, (word, index) => {
    matches.push({ word, index });
  });
  return matches.map(({ word, index }) => ({
    word,
    start: index,
    end: index + word.length - 1,
  }));
}
**演示:**
/**
 * Tokenizes an utterance and reports the position of every token.
 *
 * A word is a run of word characters, hyphens and apostrophes; any other run
 * of non-whitespace characters becomes its own token. Whitespace is dropped.
 *
 * @param {*} utterance - Value to tokenize; converted with `toString()`.
 * @returns {{word: string, start: number, end: number}[]} One entry per
 *   token, in input order; `start` and `end` are inclusive character indexes.
 */
function tokenizeUtterance(utterance) {
  const tokenPattern = /[\w-']+|[^\w\s]+/g;
  const matches = [];
  // String#replace is used only for its callback, which receives every match
  // together with its offset; the replaced string itself is discarded.
  utterance.toString().replace(tokenPattern, (word, index) => {
    matches.push({ word, index });
  });
  return matches.map(({ word, index }) => ({
    word,
    start: index,
    end: index + word.length - 1,
  }));
}
// Demo: tokenize a sample sentence and print each token with its offsets.
const string = "HELLO, WORLD";
const demoTokens = tokenizeUtterance(string);
console.log(demoTokens);
答案 3 :(得分:1)
`String.prototype.replace` 的回调函数会为您提供每个匹配项及其偏移量:
// Collects [word, start, end] triples for every token in `str`.
// Note: `end` here is exclusive (start + length), i.e. the index just past
// the token's last character.
// Both names are declared explicitly: assigning to undeclared identifiers
// creates implicit globals and throws in strict mode / ES modules.
const str = "HELLO, WORLD foo HELLO";
const result = [];
// String#replace is used only for its callback, which receives each match
// and its offset; the replaced string itself is discarded.
str.replace(/[\w'-]+|[^\w\s]+/g, (word, offset) => {
  result.push([word, offset, offset + word.length]);
});
console.log(result);