如何将字符串拆分为二元组、三元组、四元组等(n-gram)?

时间:2018-02-16 12:18:44

标签: javascript string split nlp n-gram

我想将自然语言文本拆分为单词对、三元组、四元组,依此类推!

到目前为止,我已经弄明白了如何拆分成单词对。我猜还需要一个额外的循环来处理每组的单词数。

以下是拆分成单词对的代码:



// Sample sentence to break into word n-grams.
const test = "I love you so much, but Joe said \"he doesn't\"!";
// Tokens are whatever sits between single spaces, so punctuation stays
// attached to its word (e.g. "much," and "doesn't\"!").
const words = test.split(" ");

// 2-grams: each word joined with its right-hand neighbour.
const two_words = [];
for (let i = 0; i < words.length - 1; i++) {
  two_words.push(`${words[i]} ${words[i + 1]}`);
}
console.log(two_words);

// Generalisation: collect every n-gram for sizes 2..split_length into one
// flat array, smaller sizes first.
const split_words = [];
const split_length = 5;
for (let size = 2; size <= split_length; size++) {
  // Stop at the last start index where a full window of `size` words fits.
  for (let start = 0; start + size <= words.length; start++) {
    // Slice from the source words (not from the output array) and join with
    // a space; a fresh string per window avoids carrying over earlier text.
    split_words.push(words.slice(start, start + size).join(' '));
  }
}

console.log(split_words);
补充预期输出……(n-gram 数组)大致如下:

// 2grams
"I love"
"love you"
"you so"
"so much,"
"much, but"
"but Joe"
"Joe said"
"said "he"
""he doesn't"!"
//3grams
"I love you"
"love you so"
"you so much,"
"so much, but"
//and on and on

4 个答案:

答案 0 :(得分:2)

您可以使用像lodash这样的库来大大缩短您的代码:

// Cut a string into consecutive, non-overlapping two-character pieces using
// lodash's `_.chunk` (lodash must be loaded as `_`).
var word = 'foobarbaz';
var chunks = _.chunk(word, 2).map(function (pieces) {
  // Each chunk is joined back into a single string.
  return pieces.join('');
});
console.log(chunks); //[ 'fo', 'ob', 'ar', 'ba', 'z' ]

然后您可以传递2以外的值以满足您的需求

答案 1 :(得分:2)

假设您想要的结果不包含打乱单词顺序的组合,可以尝试如下做法:
// Build every word n-gram of a sentence, from pairs up to the whole sentence.

const test = "I love you so much, but Joe said \"he doesn't\"!";
const arr = test.split(" ");

// One entry per n-gram size, in increasing size order; filled by process().
const result = [];

/**
 * Appends to the outer `result` array one group of n-grams for every size
 * from `length` up to `arr.length`; the last group therefore holds the
 * complete sentence as its only entry.
 *
 * NOTE: inside a Node module this declaration shadows the global `process`
 * object; the name is kept for compatibility with existing callers.
 *
 * @param {string[]} arr - Words in their original order.
 * @param {number} length - Smallest n-gram size to generate.
 * @returns {void}
 */
function process(arr, length) {
  // A plain loop over the sizes replaces the original tail recursion, so the
  // call stack stays flat regardless of how many words there are.
  for (let size = length; size <= arr.length; size++) {
    const grams = [];
    // The bound drops trailing words that cannot fill a window of `size`.
    for (let start = 0; start + size <= arr.length; start++) {
      grams.push(arr.slice(start, start + size).join(" "));
    }
    result.push(grams);
  }
}

process(arr, 2);

console.log(result);

答案 2 :(得分:2)

这被称为“n-gram”,在现代 JavaScript 中可以用生成器(generator)像这样实现:
/**
 * Lazily yields every run of `n` consecutive items from an iterable.
 *
 * @template T
 * @param {Iterable<T>} a - Source items, e.g. an array of words.
 * @param {number} n - Window size; nothing is yielded when the source has
 *   fewer than `n` items or when `n` is less than 1.
 * @yields {T[]} A fresh array holding the current window.
 */
function* ngrams(a, n) {
    const buf = [];

    for (const x of a) {
        buf.push(x);
        if (buf.length === n) {
            // Hand out a copy: `buf` is shifted right after the yield, which
            // would otherwise corrupt windows the caller kept (for example
            // when collecting with spread or Array.from).
            yield buf.slice();
            buf.shift();
        }
    }
}


// Demo: print each three-word window of the sentence on its own line.
var test = "The quick brown fox jumps over the lazy dog";

for (const gram of ngrams(test.split(' '), 3)) {
    console.log(gram.join(' '));
}
另一种更简洁、可能也更快的写法:

/**
 * Returns every run of `n` consecutive items of the array `a`, in order.
 *
 * @template T
 * @param {T[]} a - Source items, e.g. an array of words.
 * @param {number} n - Window size.
 * @returns {T[][]} The windows; empty when `n` is less than 1 or greater
 *   than `a.length`.
 */
const ngrams = (a, n) => {
    if (n < 1) {
        return [];
    }
    // Size the result explicitly: `a.slice(0, 1 - n)` is empty for n === 1
    // (slice(0, 0)), which silently dropped every unigram.
    const count = Math.max(0, a.length - n + 1);
    return Array.from({ length: count }, (_, i) => a.slice(i, i + n));
};

答案 3 :(得分:2)

这应该做你想要的:

/**
 * Splits text into overlapping word n-grams.
 *
 * @param {string} str - Text to tokenise; words are separated by whitespace.
 * @param {number} chunk - Number of words per n-gram.
 * @returns {string[]} The n-grams in reading order, each joined with a single
 *   space; empty when `chunk` is less than 1 or exceeds the word count.
 */
function chunkIt(str, chunk) {
  // A window of zero or negative size has no meaningful n-grams.
  if (chunk < 1) {
    return [];
  }
  // Split on whitespace runs and drop empties so repeated, leading or
  // trailing whitespace does not produce blank "words".
  const words = str.split(/\s+/).filter((word) => word !== "");
  const grams = [];
  for (let start = 0; start + chunk <= words.length; start++) {
    grams.push(words.slice(start, start + chunk).join(" "));
  }
  return grams;
}


// Demo: print the 2-, 3- and 4-word n-grams of a sample sentence, in that order.
var test = "I love you so much, but Joe said \"he doesn't\"!";
[2, 3, 4].forEach(function (size) {
  console.log(chunkIt(test, size));
});