数据集Node.js中的最近邻居

时间:2013-01-14 21:55:51

标签: node.js mongodb classification knn

我有一个Node.js应用程序,可以将数据保存到MongoDB。 给定一个文档,我想在数据库中找到最相似的文档。

我的想法是实现某种最近邻居算法,该算法将所有记录作为训练序列并返回最相似的文档(包括这两个文档的相似程度的某种百分比。)

E.g。在我的数据库中有这些记录...

{ name: "Bill",   age: 10,  pc: "Mac",      ip: "68.23.13.8" }
{ name: "Alice",  age: 22,  pc: "Windows",  ip: "193.186.11.3" }
{ name: "Bob",    age: 12,  pc: "Windows",  ip: "56.89.22.1" }

...我想找到最接近这个的文件

{ name: "Tom", age: 10, pc: "Mac", ip: "68.23.13.10" }
// algorithm returns "Bill", .76 

是否有任何Node模块/实现可以接受任何类型的对象/参数并返回它们最近的邻居?

2 个答案:

答案 0 :(得分:2)

以下是一些示例代码。它假定您可以对每个请求运行搜索。如果要修改它,请确保所有相似性函数都返回0到1之间的数字。

function tokenize(string) {
  var tokens = [];
  for (var i = 0; i < string.length-1; i++) {
    tokens.push(string.substr(i,2));
  }

  return tokens.sort();
}

function intersect(a, b)
{
  var ai=0, bi=0;
  var result = new Array();

  while( ai < a.length && bi < b.length )
  {
     if      (a[ai] < b[bi] ){ ai++; }
     else if (a[ai] > b[bi] ){ bi++; }
     else /* they're equal */
     {
       result.push(a[ai]);
       ai++;
       bi++;
     }
  }

  return result;
}

function sum(items) {
  var sum = 0;
  for (var i = 0; i < items.length; i++) {
    sum += items[i];
  }

  return sum;
}

function wordSimilarity(a, b) {
  var left   = tokenize(a);
  var right  = tokenize(b);
  var middle = intersect(left, right);

  return (2*middle.length) / (left.length + right.length);
}

function ipSimilarity(a, b) {
  var left  = a.split('.');
  var right = b.split('.');

  var diffs = [];
  for (var i = 0; i < 4; i++) {
    var diff1 = 255-left[i];
    var diff2 = 255-right[i];
    var diff  = Math.abs(diff2-diff1);

    diffs[i] = diff;
  }

  var distance = sum(diffs)/(255*4);

  return 1 - distance;
}

function ageSimilarity(a, b) {
  var maxAge   = 100;
  var diff1    = maxAge-a;
  var diff2    = maxAge-b;
  var diff     = Math.abs(diff2-diff1);
  var distance = diff / maxAge;

  return 1-distance;
}

function recordSimilarity(a, b) {
  var fields = [
    {name:'name', measure:wordSimilarity},
    {name:'age',  measure:ageSimilarity},
    {name:'pc',   measure:wordSimilarity},
    {name:'ip',   measure:ipSimilarity}
  ];

  var sum = 0;
  for (var i = 0; i < fields.length; i++) {
    var field   = fields[i];
    var name    = field.name;
    var measure = field.measure;
    var sim     = measure(a[name], b[name]);

    sum += sim;
  }

  return sum / fields.length;
}

function findMostSimilar(items, query) {
  var maxSim = 0;
  var result = null;

  for (var i = 0; i < items.length; i++) {
    var item = items[i];
    var sim  = recordSimilarity(item, query);

    if (sim > maxSim) {
      maxSim = sim;
      result = item;
    }
  }

  return result
}

var items = [
  { name: "Bill",   age: 10,  pc: "Mac",      ip: "68.23.13.8" },
  { name: "Alice",  age: 22,  pc: "Windows",  ip: "193.186.11.3" },
  { name: "Bob",    age: 12,  pc: "Windows",  ip: "56.89.22.1" }
];

var query  = { name: "Tom", age: 10, pc: "Mac", ip: "68.23.13.10" };
var result = findMostSimilar(items, query);

console.log(result);

答案 1 :(得分:0)

这样做的直接方法是计算两个文档之间的差异,差异越大,距离越大。您可以使用最大可能差异来标准化差异,这可以为您提供可以相互比较的相对距离。

看看这个问题,计算json文档的差异。

Delta encoding for JSON objects