Question

假设我在数组中有一组不同的URL：

// Sample input: three URLs on the same host plus one unrelated string.
var source = ['www.xyz.com/Product/1', 'www.xyz.com/Product/3', 'www.xyz.com/Category/1', 'somestring']

迭代数组并将类似的字符串分组到单独的数组中会有什么好方法？上例中的所需输出为：

// Desired result: similar strings collected into separate sub-arrays.
var output = [
    ['www.xyz.com/Product/1', 'www.xyz.com/Product/3'],
    ['www.xyz.com/Category/1'],
    ['somestring']
];

条件

source中的所有项目都可以是随机字符串
逻辑必须能够在有意义的时间内对100'000个项目进行比较和分组

我找到了string-similarity library，它可以将一个字符串与一组字符串进行比较。一种方法是迭代源，将每个项目与源集合进行比较，并应用规则对具有相似分数的项目进行分组。但是，我想这可能效率很低。

有人可以建议我有效地完成我需要的工作吗？

Answer 1

我能想到的最佳解决方案是将字符串相互比较并测试它们之间的差异。有一种算法可以做到这一点，即Levenshtein distance算法：

Levenshtein 距离是一种用于衡量两个序列之间差异的字符串度量。通俗地说，两个单词之间的 Levenshtein 距离是把其中一个单词变成另一个单词所需的最少单字符编辑（即插入、删除或替换）次数。

我们可以在 fast-levenshtein 模块之上轻松创建一个 Levenshtein 过滤器：

const levenshtein = require('fast-levenshtein'); 

/**
 * Groups strings whose Levenshtein distance to a group's first member is
 * at most `maximum`.
 *
 * Items are visited in input order. Each item that has not been grouped yet
 * becomes the seed of a new group, and every later ungrouped item within
 * `maximum` edits of that seed joins the group. Groups, and the members
 * inside each group, therefore keep the order of `source`.
 *
 * Distances are obtained from `levenshtein.get(a, b)` on the `levenshtein`
 * binding that is in scope where this function is defined.
 *
 * @param {string[]} source - Strings to group; the array is not modified.
 * @param {number} [maximum=5] - Largest distance still considered similar.
 * @returns {string[][]} One array per group.
 */
const levenshteinFilter = (source, maximum = 5) => {
  // grouped[i] is true once source[i] has been placed in a group; using
  // flags avoids copying the input and repeatedly splicing it.
  const grouped = new Array(source.length).fill(false);
  const matches = [];

  for (let i = 0; i < source.length; i++) {
    if (grouped[i]) {
      continue;
    }

    const seed = source[i];
    const group = [seed];

    // Only later items can still be free: earlier ones are already grouped.
    for (let j = i + 1; j < source.length; j++) {
      if (grouped[j]) {
        continue;
      }
      // Candidates are always compared with the seed, not with each other.
      if (levenshtein.get(seed, source[j]) <= maximum) {
        group.push(source[j]);
        grouped[j] = true;
      }
    }

    matches.push(group);
  }

  return matches;
};

// Example: group the sample strings using the default maximum distance of 5.
let source = ['www.xyz.com/Product/1', 'www.xyz.com/Product/3', 'www.xyz.com/Category/1', 'somestring'];
let output = levenshteinFilter(source);
// [ [ 'www.xyz.com/Product/1', 'www.xyz.com/Product/3' ],
//   [ 'www.xyz.com/Category/1' ],
//   [ 'somestring' ] ]

您可以通过函数的第二个参数指定可接受的最大距离（默认为 5）。

Answer 2

你没有详细说明自己的意图，但如果任务是从一堆随机数据中为选定的项目找到最近邻，我可能会尝试构建一棵哈希树。

或者——这可能有点取巧——我会让一个库替我完成这件事。lunr.js 基本上是一个纯 JS 实现的类 Lucene 索引，我会把你的数组放进去，然后通过查询获得相似的字符串。我以前在 lunr.js 中处理过相当大的数据集，它的性能很不错：虽然远比不上 Elasticsearch 集群，但仍然令人印象深刻。

如果您提供有关您尝试做的更多详细信息，我可以提供更多详细信息，甚至是一些示例代码。

Answer 3

如果 source 中的所有项都是随机 URL，那么下面的函数会给出问题中所示的预期输出。

/**
 * Groups URL-like strings that share their first two '/'-separated segments
 * (for example 'www.xyz.com/Product').
 *
 * Groups appear in the order their first member occurs in `source`, and
 * members keep their input order. A string that contains no '/' never joins
 * another group; each such string gets a group of its own, even when it is
 * repeated.
 *
 * @param {string[]} source - Strings to group; the array is not modified.
 * @returns {string[][]} One array per group.
 */
function filter (source) {
  const output = []
  // Prefix -> group lookup, so each item is added exactly once and is not
  // compared against every member of every existing group.
  const groupsByPrefix = new Map()

  source.forEach((svalue) => {
    if (!svalue.includes('/')) {
      output.push([svalue])
      return
    }

    const prefix = svalue.split('/').slice(0, 2).join('/')
    const existing = groupsByPrefix.get(prefix)
    if (existing) {
      existing.push(svalue)
      return
    }

    const created = [svalue]
    groupsByPrefix.set(prefix, created)
    output.push(created)
  })

  return output
}

Answer 4

根据您的示例测试，我建议您实现Radix Tree or Prefix Tree来存储字符串。之后，您可以定义一个标准来聚类这些字符串。

Answer 5

我把 user7560588 的代码改为使用基于 Dice 系数的 string-similarity 库，它在大多数情况下优于 Levenshtein 距离。 https://www.npmjs.com/package/string-similarity

您可以在 0 到 1 之间调整接受率，1 表示 100% 匹配。因此，设置一个合适的接受率可以得到更好的结果。

它的作用是循环数组中的值并比较 2 个字符串，如果匹配则将它们分组。该库还可以将字符串与字符串数组进行比较，并在数组中返回相应的评分。

var stringSimilarity = require("string-similarity");

/**
 * Groups strings whose similarity score against a group's seed is strictly
 * greater than `rate`.
 *
 * Works on a copy of `source`. The last remaining string is repeatedly taken
 * as the seed of a new group and compared, from the end of the remaining list
 * towards the start, with every string still left. Each comparison is printed
 * with `console.log` as (seed, candidate, score).
 *
 * @param {string[]} source - Strings to group; the array is not modified.
 * @param {number} [rate=0.85] - Score a candidate must exceed to join a group.
 * @returns {string[][]} One array per group, seeded from the end of `source`.
 */
const stringFilter = (source, rate = 0.85) => {
  const remaining = source.slice();
  const groups = [];

  while (remaining.length > 0) {
    const seed = remaining.pop();
    const group = [seed];

    // Walk backwards so that removing an entry does not shift the indices
    // that are still to be visited.
    let index = remaining.length;
    while (index-- > 0) {
      const candidate = remaining[index];
      const score = stringSimilarity.compareTwoStrings(seed, candidate);
      console.log(seed, candidate, score);
      if (score > rate) {
        group.push(candidate);
        remaining.splice(index, 1);
      }
    }

    groups.push(group);
  }

  return groups;
};

// Example: group the sample strings with the default acceptance rate (0.85)
// and print the resulting groups.
let source = ['www.xyz.com/Product/1', 'www.xyz.com/Product/3', 'www.xyz.com/Category/1', 'somestring'];
let output = stringFilter(source);
console.log(output);

结果

somestring www.xyz.com/Category/1 0.06666666666666667
somestring www.xyz.com/Product/3 0.06896551724137931
somestring www.xyz.com/Product/1 0.06896551724137931
www.xyz.com/Category/1 www.xyz.com/Product/3 0.5365853658536586
www.xyz.com/Category/1 www.xyz.com/Product/1 0.5853658536585366
www.xyz.com/Product/3 www.xyz.com/Product/1 0.95
[
  [ 'somestring' ],
  [ 'www.xyz.com/Category/1' ],
  [ 'www.xyz.com/Product/3', 'www.xyz.com/Product/1' ]
]

将Node.js

5 个答案: