这是计算余弦相似度的正确方法吗?

时间:2015-02-27 17:41:05

标签: javascript data-mining similarity cosine cosine-similarity

能否请哪位帮忙看一下,下面的方法(伪代码)是否适合用来计算两个向量之间的余弦相似度:

// Sample input vector and the all-ones reference vector it is compared against.
var vectorA = [2,5,7,8];
var referenceVector = [1,1,1,1];

//Apply weights to vectors (apply positive or negative weights to elements)
var weightageVector = [1,0.5,2,1.5];
// Identifier must not contain a space, otherwise the snippet does not parse.
var weightedVectA = GetWeightedVector(vectorA);

//normalize each element to a value between 0 and 1
//@see http://stn.spotfire.com/spotfire_client_help/norm/norm_scale_between_0_and_1.htm

按此计算:http://jsfiddle.net/snehilw/86jqo1sm/4/

// Scale vectorA to the 0..1 range, then score the result against the reference vector.
// NOTE(review): the unweighted vectorA is passed here, not the weighted vector
// computed earlier in this pseudocode - confirm which one is intended.
var normalizedVectorA = GetNormalizedVector(vectorA); //using the min-max formula linked above
var cosineSimilarityScore = GetCosineSimilarityScore(referenceVector, normalizedVectorA );

有人能告诉我这样做是否正确吗?因为它没有给出我期望的结果。

根据要求,以下是代码段:

// Per-element multipliers used by getWeightedVector.
var defaultVectorWeights = [1, 0.5, 2, 1.5];

// Default values for the reference vector (Do not change these).
var referenceVector = [1, 1, 1, 1];

// Vector length the helper functions accept; derived from the reference vector.
var supportedVectorLength = referenceVector.length;

/**
 * Min-max scales each element of `vector` against the matching column of
 * `multiDimArray`: (value - min) / (max - min).
 *
 * @param {Array} multiDimArray - rows whose columns supply the min and max.
 * @param {Array} vector - vector to scale; must have supportedVectorLength elements.
 * @returns {Array} the scaled vector, or an empty array when `vector` has an
 *     unsupported length.
 */
function getNormalizedVector(multiDimArray, vector){
    var normalizedVector = [];

    if (vector.length !== supportedVectorLength) {
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        console.log("Cosine similarity workload vector is of unsupported length");
        return normalizedVector;
    }

    for (var j = 0; j < supportedVectorLength; j++) {
        // Look the column range up once; each lookup walks every row.
        var minMax = getMinMaxForMultidimensionalArrayColumn(multiDimArray, j);
        var min = minMax[0];
        var max = minMax[1];

        // A constant column has no range to scale by; 0.5 avoids dividing by zero.
        var normalizedValue = (max === min) ? 0.5 : (vector[j] - min) / (max - min);
        normalizedVector.push(normalizedValue);
    }

    return normalizedVector;
}

/**
 * Computes the cosine similarity of two vectors: dot(A, B) / (|A| * |B|).
 *
 * @param {Array} vectorA - first vector; must have supportedVectorLength elements.
 * @param {Array} vectorB - second vector; must have supportedVectorLength elements.
 * @returns {number|undefined} the similarity score; 0 when either vector has
 *     zero magnitude; undefined when either vector has an unsupported length.
 */
function getCosineSimilarityScore(vectorA, vectorB) {
    if (vectorA.length !== supportedVectorLength || vectorB.length !== supportedVectorLength) {
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        console.log("Cosine similarity workload vectors are of unequal lengths");
        return undefined;
    }

    var product = 0;
    var normVectorA = 0;
    var normVectorB = 0;

    for (var i = 0; i < vectorA.length; i++) {
        product += vectorA[i] * vectorB[i];
        normVectorA += vectorA[i] * vectorA[i];
        normVectorB += vectorB[i] * vectorB[i];
    }

    var magnitude = Math.sqrt(normVectorA) * Math.sqrt(normVectorB);

    // A zero vector has no direction; dividing would yield NaN (0 / 0), so
    // report "no similarity" instead.
    if (magnitude === 0) {
        return 0;
    }

    return product / magnitude;
}

/**
 * Multiplies each element of `vector` by the weight at the same index in
 * defaultVectorWeights.
 *
 * @param {Array} vector - vector to weight; must have supportedVectorLength elements.
 * @returns {Array} the weighted vector, or an empty array (after logging) when
 *     `vector` has an unsupported length.
 */
function getWeightedVector(vector) {
    var weighted = [];

    if (vector.length != supportedVectorLength) {
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        console.log("Cosine similarity workload vector is of unsupported length");
        return weighted;
    }

    var idx = 0;
    while (idx < supportedVectorLength) {
        weighted[idx] = defaultVectorWeights[idx] * vector[idx];
        idx += 1;
    }

    return weighted;
}

/**
 * Finds the smallest and largest value in one column of a row-major array.
 *
 * @param {Array} multiDimArray - array of rows; each row is indexed by `column`.
 * @param {number} column - column index; must be below supportedVectorLength.
 * @returns {Array} [min, max] for the column, or an empty array (after
 *     logging) when `column` is out of range. With no rows the result is
 *     [Infinity, -Infinity].
 */
function getMinMaxForMultidimensionalArrayColumn(multiDimArray, column){
    if (!(column < supportedVectorLength)) {
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        console.log("Cosine similarity workload vectors are of unequal lengths");
        return [];
    }

    // Treat a missing matrix as having no rows.
    var rows = multiDimArray || [];
    var min = Infinity;
    var max = -Infinity;

    // Single pass over the rows: no temporary column array and no need to
    // spread every value into one Math.min/Math.max call.
    for (var i = 0; i < rows.length; i++) {
        var value = rows[i][column];
        min = Math.min(min, value);
        max = Math.max(max, value);
    }

    return [min, max];
}

/**
 * Scores one query vector against the reference vector: the query and the
 * whole matrix are weighted, the weighted query is normalized against the
 * weighted matrix, and the result is compared with referenceVector by
 * cosine similarity.
 *
 * @param {Array} multiDimArray - all vectors; supplies the normalization range.
 * @param {Array} queryVector - the vector to score.
 * @returns {number|undefined} whatever getCosineSimilarityScore returns.
 */
function getAssociateWorkloadScore(multiDimArray, queryVector){
    //Apply feature scaling
    var weightedQueryVector = getWeightedVector(queryVector);
    // NOTE(review): getWeightedMultiDimArr is not defined in the visible code;
    // presumably it applies getWeightedVector to every row - confirm.
    var weightedMultiDimArr = getWeightedMultiDimArr(multiDimArray);
    var normalizedQueryVector = getNormalizedVector(weightedMultiDimArr, weightedQueryVector);

    var workloadScore = getCosineSimilarityScore(referenceVector, normalizedQueryVector);

    console.log('weightedQueryVector='+weightedQueryVector);
    console.log('weightedMultiDimArr='+JSON.stringify(weightedMultiDimArr));
    console.log('normalizedQueryVector='+normalizedQueryVector);

    console.log('workloadScore='+JSON.stringify(workloadScore));

    return workloadScore;
}

/**
 * Scores every row of `multiDimArray` by passing it, together with the full
 * array, to getAssociateWorkloadScore.
 *
 * @param {Array} multiDimArray - array of vectors to score.
 * @returns {Array} one score per row, in row order.
 */
function getTeamWorkloadScore(multiDimArray){
    var scores = [];
    var row = 0;

    while (row < multiDimArray.length) {
        var rowScore = getAssociateWorkloadScore(multiDimArray, multiDimArray[row]);
        scores.push(rowScore);
        row += 1;
    }

    return scores;
}

2 个答案:

答案 0 :(得分:2)

余弦相似度就是点积除以两个向量范数的乘积。那么为什么不分别写一个点积函数和一个范数函数,再把两者的结果相除呢?(dotproduct 来自 http://c2.com/cgi/wiki?DotProductInManyProgrammingLanguages)

/**
 * Sums the element-wise products of two vectors, stopping at the end of the
 * shorter one.
 *
 * @param {Array} a - first vector.
 * @param {Array} b - second vector.
 * @returns {number} the dot product; 0 when either vector is empty.
 */
function dotproduct(a,b) {
    var count = Math.min(a.length, b.length);
    var total = 0;
    var idx = 0;

    while (idx < count) {
        total += a[idx] * b[idx];
        idx += 1;
    }

    return total;
}

/**
 * Returns the Euclidean (L2) norm of a vector: the square root of the sum of
 * its squared elements.
 *
 * @param {Array} a - the vector.
 * @returns {number} the norm; 0 for an empty vector.
 */
function norm2(a) {
    var squares = 0;
    var idx = 0;

    while (idx < a.length) {
        squares += a[idx] * a[idx];
        idx += 1;
    }

    return Math.sqrt(squares);
}

/**
 * Cosine similarity of two vectors: their dot product divided by the norm of
 * each vector in turn.
 *
 * @param {Array} a - first vector.
 * @param {Array} b - second vector.
 * @returns {number} the similarity score.
 */
function similarity(a, b) {
    var dot = dotproduct(a, b);
    var normA = norm2(a);
    var normB = norm2(b);

    // Divide by each norm separately, in this order, to keep the same
    // floating-point result as a single chained division.
    return dot / normA / normB;
}

现在similarity([1,0,0], [0,1,1]) == 0

答案 1 :(得分:1)

如果你确实需要尺度不变性(即原始的余弦相似度),那么可以使用 Gavin 的代码,并增加对零向量的检查:

/**
 * Cosine similarity with a guard for zero vectors.
 *
 * @param {Array} x - first vector.
 * @param {Array} y - second vector.
 * @returns {number} dotproduct(x, y) / (norm2(x) * norm2(y)), or 0 when
 *     either norm is falsy (zero or NaN).
 */
function cosine_sim(x, y) {
    // Declared locally: assigning to undeclared names creates implicit
    // globals and throws a ReferenceError in strict mode / ES modules.
    var xnorm = norm2(x);
    if (!xnorm) return 0;

    var ynorm = norm2(y);
    if (!ynorm) return 0;

    return dotproduct(x, y) / (xnorm * ynorm);
}

如果您不需要尺度不变性,只需直接使用点积(即令 cosine_sim(x, y) 等于 dotproduct(x, y))。