Question

我有数百个JSON字符串。其中每一个都包含一个由15-20个单词组成的数组，这些单词按某个预先确定的权重排序。如果有必要说明的话，这个权重是这些单词在某段文本中出现的次数。要找出这种结构的单词数组之间的相似性，最佳方法是什么？

我想到的第一个想法是创建所有单词的数字散列，并基本比较这些值以确定相似性。我对此并不是很成功，因为非常相似的字符串的结果哈希值不是非常接近。在对字符串比较算法进行一些研究之后，我来到Stackoverflow希望获得更多指导。在此先感谢，如果您需要更多详细信息，请告诉我。

编辑1：澄清我正在尝试做的事情：我想确定两个数组的相似程度。我还想考虑每个单词在每个数组中的权重。例如：

// Example inputs: each array is a list of {word, count} entries.
// array2 and array4 contain the same words with the same counts;
// array3 contains the same words as those two but with different counts.
var array1 = [{"word":"hill","count":5},{"word":"head","count":5}];
var array2 = [{"word":"valley","count":7},{"word":"head","count":5}];
var array3 = [{"word":"head", "count": 6}, {"word": "valley", "count": 5}];
var array4 = [{"word": "valley", "count": 7}, {"word":"head", "count": 5}];

在该示例中，数组4和数组2比数组2和数组3更相似，因为尽管这两对数组都包含相同的单词，但只有数组4和数组2中的权重也相同。希望这样更容易理解。提前谢谢。

Answer 1

我认为您想要的是“cosine similarity”，您可能还想查看vector space models。如果您使用Java编写代码，则可以使用开源S-space包。

（10月31日添加）向量的每个元素都是一个特定字符串的计数。您只需将字符串数组转换为此类向量。在你的例子中，共有三个词——“hill”、“head”、“valley”。如果向量按此顺序排列，则各数组对应的向量为：

// array: #hill, #head, #valley
array1:  {5,     5,     0}
array2:  {0,     5,     7}
array3:  {0,     6,     5}
array4:  {0,     5,     7}

Answer 2

鉴于每个数组都必须与其他每个数组进行比较，处理量会相当大，大约是 Σ(n-1) 次比较乘以每个数组中“单词”的平均数量。您需要存储每次比较的得分，然后再对这些得分加以解读。

例如：

// Sample inputs for compareThings: each array is a list of {word, count} entries.
var array1 = [{"word":"hill","count":5},{"word":"head","count":5}];
var array2 = [{"word":"valley","count":7},{"word":"head","count":5}];
var array3 = [{"word":"head", "count": 6}, {"word": "valley", "count": 5}];
var array4 = [{"word": "valley", "count": 7}, {"word":"head", "count": 5}];

// Comparison score is summed product of matching word counts
/**
 * Scores every unordered pair of the word arrays passed as arguments.
 *
 * Each argument is an array of {word, count} entries. The score of a pair is
 * the sum, over every entry pair whose words are equal, of the product of the
 * two counts. Pairs are visited from the last argument backwards, each array
 * being paired only with the arrays that precede it.
 *
 * @returns {string[]|undefined} Strings of the form "i-j:score" (i > j are the
 *   zero-based argument positions), or undefined when fewer than two arrays
 *   are supplied.
 */
function compareThings() {
  var lists = arguments;
  var total = lists.length;
  var output = [];
  var hi, lo, later, earlier, p, q, target, sum;

  // Nothing to pair up with fewer than two arrays.
  if (total < 2) return;

  for (hi = total - 1; hi >= 0; hi--) {
    later = lists[hi];

    // Pair with each array that comes before it, so no pair is scored twice.
    for (lo = hi - 1; lo >= 0; lo--) {
      earlier = lists[lo];
      sum = 0;

      for (p = 0; p < earlier.length; p++) {
        target = earlier[p].word;

        for (q = 0; q < later.length; q++) {
          // Matching words contribute the product of their counts.
          if (later[q].word == target) {
            sum += later[q].count * earlier[p].count;
          }
        }
      }

      output.push(hi + '-' + lo + ':' + sum);
    }
  }

  return output;
}

// Score all four sample arrays against each other.
var results = compareThings(array1, array2, array3, array4);

alert('Raw results:\n' + results.join('\n'));
/*
Raw results:
3-2:65
3-1:74
3-0:25
2-1:65
2-0:30
1-0:25
*/

// Order the "i-j:score" entries by the score after the colon, highest first.
results.sort(function(left, right) {
  var leftScore = left.split(':')[1];
  var rightScore = right.split(':')[1];
  return rightScore - leftScore;
});

alert('Sorted results:\n' + results.join('\n'));
/*
Sorted results:
3-1:74
3-2:65
2-1:65
2-0:30
3-0:25
1-0:25
*/

所以3-1（array4和array2）得分最高。幸运的是，比较只需要单向进行，您不必既将a与b比较、又将b与a比较。

Answer 3

这是一次尝试。该算法不是很聪明（计数差异 > 20 的相同单词，与完全不同的单词效果相同），但可能是一个有用的开始：

// Sample data: each entry is one array of {word, count} records.
var wordArrays = [
    [{"word":"hill","count":5},{"word":"head","count":5}]
  , [{"word":"valley","count":7},{"word":"head","count":5}]
  , [{"word":"head", "count": 6}, {"word": "valley", "count": 5}]
  , [{"word": "valley", "count": 7}, {"word":"head", "count": 5}]
];

/**
 * Ranks every array in wordArrays by how much it differs from
 * wordArrays[index].
 *
 * For each (source entry, candidate entry) pair, the difference grows by the
 * absolute distance between the counts when the words are equal, and by a
 * fixed penalty of 20 otherwise. Every non-matching cross pair is penalised,
 * so even an identical array has a non-zero difference.
 *
 * @param {number} index Position in wordArrays of the array to compare against.
 * @returns {Array<{arr: string, index: number, diff: number}>|null} One record
 *   per array in wordArrays (including the source itself), sorted by ascending
 *   diff; `arr` is the JSON text of the array. Returns null when there is no
 *   array at `index`.
 */
function getSimilarTo(index){
    var MISMATCH_PENALTY = 20;
    var src = wordArrays[index];
    // Declared locally: the bare assignment used before created a global in
    // sloppy mode and threw a ReferenceError in strict mode / ES modules.
    var weighted;

    if (!src) return null;

    // compare with other arrays
    weighted = wordArrays.map(function(arr, i){
        var diff = 0;
        src.forEach(function(item){
            arr.forEach(function(other){
                if (other.word === item.word){
                    // add the absolute distance in count
                    diff += Math.abs(item.count - other.count);
                } else {
                    // mismatches
                    diff += MISMATCH_PENALTY;
                }
            });
        });
        return {
            arr   : JSON.stringify(arr)
          , index : i
          , diff  : diff
        };
    });

    return weighted.sort(function(a,b){
        if (a.diff > b.diff) return 1;
        if (a.diff < b.diff) return -1;
        return 0;
    });
}

/*
getSimilarTo(3)
[ { arr: '[{"word":"valley","count":7},{"word":"head","count":5}]',
    index: 1,
    diff: 40 },
  { arr: '[{"word":"valley","count":7},{"word":"head","count":5}]',
    index: 3,
    diff: 40 },
  { arr: '[{"word":"head","count":6},{"word":"valley","count":5}]',
    index: 2,
    diff: 43 },
  { arr: '[{"word":"hill","count":5},{"word":"head","count":5}]',
    index: 0,
    diff: 60 } ]
*/

Answer 4

在尝试比较之前，先按单词对数组进行排序。排序完成后，比较两个数组只需将每个数组遍历一次。

对数组进行排序后，这是一个比较算法（pseudo-java）：


// Returns a difference score for two arrays that are BOTH sorted by word
// (lower means more similar). Matching words add the absolute difference of
// their counts; a word present in only one array adds 100 plus its count.
// Each array is walked exactly once.
int compare(array1, array2)
{
  returnValue = 0;
  array1Index = 0;
  array2Index = 0;

  // Merge-walk both arrays while each still has words left.
  while (array1Index < array1.length && array2Index < array2.length)
  {
    order = array1[array1Index].word.compareTo(array2[array2Index].word);

    if (order == 0) // words match.
    {
      returnValue += abs(array1[array1Index].count - array2[array2Index].count);
      ++array1Index;
      ++array2Index;
    }
    else if (order < 0) // array1 word sorts first, so array2 cannot contain it.
    {
      // 100 is just a number to give xtra weight to unmatched numbers.
      returnValue += 100 + array1[array1Index].count;
      ++array1Index;
    }
    else // array2 word sorts first, so array1 cannot contain it.
    {
      // 100 is just a number to give xtra weight to unmatched numbers.
      returnValue += 100 + array2[array2Index].count;
      ++array2Index;
    }
  }

  // account for any extra unmatched array 1 values.
  while (array1Index < array1.length)
  {
    returnValue += 100 + array1[array1Index].count;
    ++array1Index; // advance, otherwise this loop never terminates.
  }

  // account for any extra unmatched array 2 values.
  while (array2Index < array2.length)
  {
    returnValue += 100 + array2[array2Index].count;
    ++array2Index; // advance, otherwise this loop never terminates.
  }

  return returnValue;
}

比较字符串数组的相似性

4 个答案: