用JavaScript计算单词在文档中的开始和结束位置

时间:2018-11-13 16:26:34

标签: javascript text-processing

我有一个文本文档,表示为句子的array,对于每个句子,我都有一个array的单词标记。

我必须针对每个令牌位置计算文档中令牌位置的绝对起点和终点,因此,如果在句子中我有ipsum次五次,则必须在该句子中获得正确的位置每次出现。

我写了这个函数

// calculate begin and end to each token in a sentence
function calculateTokenBeginEnd(textArray) {
  var currentText = [];
  textArray.sentences.forEach(function(sentence) {
    for (var i = 0; i < sentence.tokens.length; ++i) {
      var token = sentence.tokens[i];
      var word = token.word;
      if (i > 0) {
        var thisBegin = token.characterOffsetBegin;
        var previousEnd = sentence.tokens[i - 1].characterOffsetEnd;
        if (thisBegin > previousEnd) {
          currentText.push(' ');
        }
      }
      token.characterOffsetBegin = currentText.length;
      for (var j = 0; j < word.length; ++j) {
        currentText.push(word[j]);
      }
      token.characterOffsetEnd = currentText.length;
    }
    currentText.push('\n');
  });
  return textArray;
} //calculateTokenBeginEnd

但是出了点问题。计算出的characterOffsetBegincharacterOffsetEnd是错误的。 文档具有以下结构

{
    "sentences": [
        {
          "index": 0,
          "text": "Lorem ipsum dolor sit amet,",
          "tokens": [
            {
              "index": 1,
              "word": "Lorem",
              "characterOffsetBegin": 0,
              "characterOffsetEnd": 5
            },
            {
              "index": 2,
              "word": "ipsum",
              "characterOffsetBegin": 5,
              "characterOffsetEnd": 10
            },
    ...
          ]
        },
        {
          "index": 1,
          "text": " consectetur adipiscing elit,",
          "tokens": [
            {
              "index": 1,
              "word": "",
              "characterOffsetBegin": 24,
              "characterOffsetEnd": 24
            },
    ...
    }

这是使用此方法的示例。然后calculateTokenBeginEnd应该计算令牌的开始索引和结束索引,而text2SentencesTokens创建上面的文档结构。 calculateTokenBeginEnd不能按预期工作。

text = "Lorem ipsum dolor sit amet,\n consectetur adipiscing elit,\nsed do eiusmod tempor incididunt\nut labore et dolore magna aliqua.\nUt enim ad minim veniam,\nquis nostrud exercitation ullamco laboris nisi\nut aliquip ex ea commodo consequat.\nDuis aute irure dolor in reprehenderit in voluptate velit esse\ncillum dolore eu fugiat nulla pariatur.\nExcepteur sint occaecat cupidatat non proident,\nLorem ipsum dolor sit amet etwas,\nsunt in culpa qui officia deserunt mollit anim id est laborum"

// to map a text to sentences and tokens
text2SentencesTokens = function(text) {
  var self = this;
  return new Promise((resolve, _) => {
    let sentences = text.split(/\n+/g);
    let sentencesP = sentences.map((sentence, lineIndex) => { // for each sentence
      return new Promise((resolve, _) => {
        let tokens = sentence.split(/\s+/g);
        let tokensP = tokens.map((token, tokenIndex) => { // for each token
          let item = {
            "index": (tokenIndex + 1),
            "word": token
          }
          if (typeof(tokenP) == 'function') {
            return tokenP.apply(self, [item]);
          } else {
            return new Promise((resolve, _) => {
              resolve(item);
            });
          }
        });
        Promise.all(tokensP)
          .then(res => {
            resolve({
              index: lineIndex,
              text: sentence,
              tokens: res
            });
          })
          .catch(err => console.error(err))
      });
    });
    Promise.all(sentencesP)
      .then(res => {
        resolve({
          sentences: res
        })
      })
      .catch(err => console.error(err))
  });
} //text2SentencesTokens

// calculate begin and end to each token in a sentence
function calculateTokenBeginEnd(textArray) {
  var currentText = [];
  textArray.sentences.forEach(function(sentence) {
    for (var i = 0; i < sentence.tokens.length; ++i) {
      var token = sentence.tokens[i];
      var word = token.word;
      if (i > 0) {
        var thisBegin = token.characterOffsetBegin;
        var previousEnd = sentence.tokens[i - 1].characterOffsetEnd;
        if (thisBegin > previousEnd) {
          currentText.push(' ');
        }
      }
      token.characterOffsetBegin = currentText.length;
      for (var j = 0; j < word.length; ++j) {
        currentText.push(word[j]);
      }
      token.characterOffsetEnd = currentText.length;
    }
    currentText.push('\n');
  });
  return textArray;
} //calculateTokenBeginEnd

text2SentencesTokens(text)
  .then(sentences => {
    sentences = calculateTokenBeginEnd(sentences);
    console.log(sentences);

  })

[更新]

根据建议,我将函数重写如下:

   function calculateTokenBeginEnd(textArray) {
        var wordStart=-1;
        for (var j = 0; j < textArray.sentences.length; ++j) {
            var sentence=textArray.sentences[j];
            wordStart +=1;
            for (var i = 0; i < sentence.tokens.length; ++i) {
                var token = sentence.tokens[i];
                var word = token.word;
                var wordRegex = new RegExp("\\b(" + word + ")\\b", "gi");
                var match = wordRegex.exec(sentence.text);
                var previousEnd = 0;
                wordStart += match.index + previousEnd;
                var wordEnd = wordStart + word.length - 1;
                token.characterOffsetBegin = wordStart;
                token.characterOffsetEnd = wordEnd;
            }
        }
    }//calculateTokenBeginEnd

有更好的解决方案吗?

[UPDATE 2] 我已经根据建议的解决方案更新了text2SentencesTokens。问题在于,当一个或多个句子中有相同token的多个匹配项时,此解决方案将无法正常工作,因为它将覆盖最后一个匹配位置的开始和结束位置,因此标记{{1 }}将获得最后匹配的位置:

down

在第一个句子的第一次出现时,它应该具有第一个匹配的位置。

   {
      "index": 2,
      "word": "down",
      "characterOffsetBegin": 70,
      "characterOffsetEnd": 73
    }

1 个答案:

答案 0 :(得分:1)

这可能是计算句子中单词开头/结尾的一种更简便的方法,希望对您有所帮助

var word = "Lorem";
var reg = RegExp(word, 'g');
var sentence = "Lore ipsum Lorem dolor sit Lorem amet,";
var match;

console.log(sentence);
console.log(word);

while ((match = reg.exec(sentence)) !== null) {
  var wordStart = match.index;
  var wordEnd = wordStart + word.length - 1;
  console.log(wordStart + ' -start index');
  console.log(word.length + ' -length of word');
  console.log(wordEnd + ' -last character index, need to +1 to use with substring');
  console.log(sentence.substring(wordStart, wordEnd + 1) + '-using substring with calculated to find the word and verify');
}