Question

我编写了一个基于空格分割单词并保留标点符号的函数。

/**
 * Splits an utterance into word and punctuation tokens and logs them.
 *
 * A token is either a run of word characters, hyphens and apostrophes, or a
 * run of characters that are neither word characters nor whitespace.
 * Whitespace only separates tokens and never appears in the output.
 *
 * @param {*} utterance - Value whose `toString()` result is tokenized.
 * @returns {void} Nothing is returned; the token array (or `null` when the
 *   text contains no token at all) is written to the console.
 */
function tokenizeUtterance(utterance) {
  const tokenPattern = /[\w-']+|[^\w\s]+/g;
  const text = utterance.toString();
  const tokens = text.match(tokenPattern);
  console.log(tokens);
}

假设我有一串文字，如：“HELLO, WORLD”

我想以下列格式提取JSON对象中分割词的相对开始和结束位置。

+-------+-------+-----+
| word  | start | end |
+-------+-------+-----+
| HELLO |     0 |   4 |
| ,     |     5 |   6 |
| WORLD |     7 |  11 |
+-------+-------+-----+

Answer 1

一种相对简单的方法是：假设输出的词保持其在输入字符串中的顺序，那么只需依次累加每个输出词的字符数，即可得到它的位置（注意：这种方式不会计入词与词之间被丢弃的空白字符）：

/**
 * Splits an utterance into word and punctuation tokens.
 *
 * A token is either a run of word characters, hyphens and apostrophes, or a
 * run of characters that are neither word characters nor whitespace.
 *
 * @param {*} utterance - Value whose `toString()` result is tokenized.
 * @returns {string[]|null} The tokens in order of appearance, or `null`
 *   when the text contains no token (that is what `String#match` yields).
 */
function tokenizeUtterance(utterance) {
  const tokenPattern = /[\w-']+|[^\w\s]+/g;
  const text = utterance.toString();
  return text.match(tokenPattern);
}

/**
 * Computes inclusive start/end positions for an ordered list of tokens.
 *
 * Without `utterance`, tokens are assumed to sit back to back: each token
 * starts right after the previous one ends, so any whitespace that was
 * dropped during tokenization is NOT reflected in the positions.
 *
 * With `utterance`, every token is located in that text, searching forward
 * from the end of the previous token, so the positions are real offsets into
 * the original string (whitespace included).
 *
 * @param {string[]|null|undefined} tokenizedUtterance - Tokens in order of
 *   appearance. `null`/`undefined` (what `String#match` returns when nothing
 *   matched) is treated as "no tokens".
 * @param {*} [utterance] - Optional original text the tokens came from.
 * @returns {{word: string, start: number, end: number}[]} One record per
 *   token; `end` is the index of the token's last character.
 */
function getStartAndEnd(tokenizedUtterance, utterance) {
    // A tokenizer built on String#match hands back null for empty input.
    if (!tokenizedUtterance) {
        return [];
    }

    const hasSource = utterance !== undefined && utterance !== null;
    const source = hasSource ? String(utterance) : '';
    const result = [];
    let cursor = 0;

    for (const word of tokenizedUtterance) {
        let start = cursor;
        if (hasSource) {
            // Only whitespace separates tokens, so the first hit at or after
            // the cursor is this token's true position.
            const found = source.indexOf(word, cursor);
            if (found !== -1) {
                start = found;
            }
        }
        result.push({
            word,
            start,
            end: start + word.length - 1
        });
        // Continue after this token so repeated words are not found twice.
        cursor = start + word.length;
    }
    return result;
}

这是你得到的：

[
    {"word":"HELLO","start":0,"end":4},
    {"word":",","start":5,"end":5},
    {"word":"WORLD","start":6,"end":10}
]

Answer 2

以下是使用indexOf为每个单词执行此操作的方法：

/**
 * Tokenizes an utterance and locates every token in the text with indexOf.
 *
 * A token is either a run of word characters, hyphens and apostrophes, or a
 * run of characters that are neither word characters nor whitespace.
 *
 * @param {*} utterance - Value whose `toString()` result is tokenized.
 * @returns {{word: string, start: number, end: number}[]} One record per
 *   token in order of appearance; `start` and `end` are inclusive offsets
 *   into the text. Empty when the text contains no token.
 */
function getResult(utterance) {
  // Tokenize and search the same string so non-string input works too.
  const text = utterance.toString();
  // String#match returns null when nothing matches; fall back to no tokens.
  const tokens = text.match(/[\w'-]+|[^\w\s]+/g) || [];
  const result = [];
  let searchFrom = 0;

  for (const word of tokens) {
    const start = text.indexOf(word, searchFrom);
    result.push({
      word,
      start,
      end: start + word.length - 1
    });
    // Resume after this token; resuming at its start would find the same
    // spot again for a repeated or prefix token (e.g. "a a", "ab a").
    searchFrom = start + word.length;
  }
  return result;
}
console.log(JSON.stringify(getResult('Hello, world Hello')));

输出结果为：

[
 {"word":"Hello","start":0,"end":4},
 {"word":",","start":5,"end":5},
 {"word":"world","start":7,"end":11},
 {"word":"Hello","start":13,"end":17}
]

Answer 3

这里你只需要：

- 将字符串拆分为由 word 和 index 组成的键值对数组。
- 使用 Array#map() 方法为每个 word 返回自定义的数据。

这应该是你的代码：

/**
 * Tokenizes an utterance and reports where each token sits in the text.
 *
 * A token is either a run of word characters, hyphens and apostrophes, or a
 * run of characters that are neither word characters nor whitespace.
 *
 * @param {*} utterance - Value whose `toString()` result is tokenized.
 * @returns {{word: string, start: number, end: number}[]} One record per
 *   token in order of appearance; `start` and `end` are inclusive offsets.
 */
function tokenizeUtterance(utterance) {
  const text = utterance.toString();
  const tokenPattern = /[\w-']+|[^\w\s]+/g;

  // Step 1: gather every match together with the offset it was found at.
  const pairs = [];
  for (let hit = tokenPattern.exec(text); hit !== null; hit = tokenPattern.exec(text)) {
    pairs.push({ word: hit[0], index: hit.index });
  }

  // Step 2: turn each (word, index) pair into the record handed to the caller.
  return pairs.map(({ word, index }) => ({
    word,
    start: index,
    end: index + word.length - 1,
  }));
}

演示：

/**
 * Tokenizes an utterance and reports where each token sits in the text.
 *
 * A token is either a run of word characters, hyphens and apostrophes, or a
 * run of characters that are neither word characters nor whitespace.
 *
 * @param {*} utterance - Value whose `toString()` result is tokenized.
 * @returns {{word: string, start: number, end: number}[]} One record per
 *   token in order of appearance; `start` and `end` are inclusive offsets.
 */
function tokenizeUtterance(utterance) {
  const text = utterance.toString();
  const tokenPattern = /[\w-']+|[^\w\s]+/g;

  // Step 1: gather every match together with the offset it was found at.
  const pairs = [];
  for (let hit = tokenPattern.exec(text); hit !== null; hit = tokenPattern.exec(text)) {
    pairs.push({ word: hit[0], index: hit.index });
  }

  // Step 2: turn each (word, index) pair into the record handed to the caller.
  return pairs.map(({ word, index }) => ({
    word,
    start: index,
    end: index + word.length - 1,
  }));
}

// Demo: print the token records for a sample sentence.
var string = "HELLO, WORLD";

console.log(tokenizeUtterance(string));

Answer 4

String.prototype.replace 的回调函数会收到每个匹配项及其偏移量：

// Tokenize `str` and record [word, start, end] for every token.
//
// String.prototype.replace invokes its callback once per match with the
// matched text and its offset (the pattern has no capture groups), which is
// exactly the data needed here. The string that replace() builds is ignored.
//
// `end` is exclusive (offset + length), so str.slice(start, end) === word.
//
// Both bindings are declared explicitly: assigning to undeclared names
// throws a ReferenceError in strict mode and in ES modules.
const str = "HELLO, WORLD foo HELLO";
const result = [];

str.replace(/[\w'-]+|[^\w\s]+/g, (word, offset) => {
    result.push([word, offset, offset + word.length]);
    // Return the match unchanged rather than leaking push()'s return value
    // into the (discarded) replacement string.
    return word;
});

console.log(result);

在 JavaScript 中拆分字符串的同时，获取各单词的相对起始和结束索引

4 个答案: