能否请哪位帮忙看一下,下面的方法(伪代码)是否适合用来计算两个向量之间的余弦相似度:
// Example input vector and the all-ones reference vector it is later compared against.
var vectorA = [2,5,7,8];
var referenceVector = [1,1,1,1];
//Apply weights to vectors (apply positive or negative weights to elements)
var weightageVector = [1,0.5,2,1.5];
// Holds the weighted form of vectorA; a single camelCase identifier, since names cannot contain spaces.
var weightedVectA = GetWeightedVector(vectorA);
//normalize each element to a value between 0 and 1
//@see http://stn.spotfire.com/spotfire_client_help/norm/norm_scale_between_0_and_1.htm
按此计算:http://jsfiddle.net/snehilw/86jqo1sm/4/
// Scale the elements of the vector into the 0..1 range.
// NOTE(review): this passes the raw vectorA, not the weighted vector computed earlier — confirm which one was intended.
var normalizedVectorA = GetNormalizedVector(vectorA); //using the formula above
// Score the normalized vector against the all-ones reference vector.
var cosineSimilarityScore = GetCosineSimilarityScore(referenceVector, normalizedVectorA );
有人能告诉我这种做法是否正确吗?因为它没有给出我预期的结果。
根据要求,以下是代码段:
// Per-element weights multiplied into a vector by getWeightedVector.
var defaultVectorWeights = [1, 0.5, 2, 1.5];

// Default values for the reference vector (do not change these).
var referenceVector = [1, 1, 1, 1];

// Vector length accepted by the scoring helpers; derived from the reference vector.
var supportedVectorLength = referenceVector.length;
/**
 * Min-max normalizes each element of `vector` against the matching column of
 * `multiDimArray`: (value - min) / (max - min).
 *
 * @param {Array<Array<number>>} multiDimArray - Rows whose columns supply the min/max per position.
 * @param {Array<number>} vector - Vector to normalize; must have `supportedVectorLength` elements.
 * @returns {Array<number>} The normalized vector, or an empty array when `vector`
 *     does not have the supported length.
 */
function getNormalizedVector(multiDimArray, vector){
    var normalizedVector = [];
    if(vector.length != supportedVectorLength){
        return normalizedVector;
    }
    for(var j = 0; j < supportedVectorLength; j++){
        // Look the column range up once; each lookup scans the whole column.
        var minMax = getMinMaxForMultidimensionalArrayColumn(multiDimArray, j);
        var min = minMax[0];
        var max = minMax[1];
        // A constant column has no range to scale by, so use the midpoint 0.5.
        normalizedVector.push((max == min) ? 0.5 : (vector[j] - min) / (max - min));
    }
    return normalizedVector;
}
/**
 * Computes the cosine similarity of two vectors: dot(a, b) / (|a| * |b|).
 *
 * @param {Array<number>} vectorA - First vector; must have `supportedVectorLength` elements.
 * @param {Array<number>} vectorB - Second vector; must have `supportedVectorLength` elements.
 * @returns {number|undefined} The similarity score; 0 when either vector has zero
 *     magnitude; `undefined` (after logging) when either length is unsupported.
 */
function getCosineSimilarityScore(vectorA, vectorB) {
    if((vectorA.length != supportedVectorLength) || (vectorB.length != supportedVectorLength)){
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        console.log("Cosine similarity workload vectors are of unequal lengths");
        return undefined;
    }

    var product = 0;
    var normVectorA = 0;
    var normVectorB = 0;
    for (var i = 0; i < vectorA.length; i++) {
        product += vectorA[i] * vectorB[i];
        normVectorA += vectorA[i] * vectorA[i];
        normVectorB += vectorB[i] * vectorB[i];
    }

    // A zero vector has no direction; dividing would give 0/0 = NaN, so report no similarity.
    if (normVectorA === 0 || normVectorB === 0) {
        return 0;
    }
    return product / (Math.sqrt(normVectorA) * Math.sqrt(normVectorB));
}
/**
 * Multiplies each element of `vector` by the weight at the same index in
 * `defaultVectorWeights`.
 *
 * @param {Array<number>} vector - Vector to weight; must have `supportedVectorLength` elements.
 * @returns {Array<number>} The weighted vector, or an empty array (after logging)
 *     when `vector` does not have the supported length.
 */
function getWeightedVector(vector) {
    if (vector.length != supportedVectorLength) {
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        console.log("Cosine similarity workload vector is of unsupported length");
        return [];
    }

    var weighted = [];
    var idx = 0;
    while (idx < supportedVectorLength) {
        weighted[idx] = defaultVectorWeights[idx] * vector[idx];
        idx += 1;
    }
    return weighted;
}
/**
 * Finds the smallest and largest value in one column of an array of rows.
 *
 * @param {Array<Array<number>>} multiDimArray - Rows to scan; a null/undefined value is treated as no rows.
 * @param {number} column - Index of the column to inspect; must be below `supportedVectorLength`.
 * @returns {Array<number>} `[min, max]` for the column, or an empty array (after
 *     logging) when `column` is out of range. With no rows the result is
 *     `[Infinity, -Infinity]`, which is what Math.min/Math.max yield for no arguments.
 */
function getMinMaxForMultidimensionalArrayColumn(multiDimArray, column){
    var minMax = []; //[min,max]
    if(column >= supportedVectorLength){
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        console.log("Cosine similarity workload vectors are of unequal lengths");
        return minMax;
    }

    //Extract columnar array from the multi-dimensional array
    var rows = multiDimArray || [];
    var columnarArray = [];
    for(var r = 0; r < rows.length; r++){
        columnarArray.push(rows[r][column]);
    }

    //Find the MIN and MAX
    minMax.push(Math.min.apply(Math, columnarArray));
    minMax.push(Math.max.apply(Math, columnarArray));
    return minMax;
}
/**
 * Scores one query vector: weights it, normalizes it against the weighted rows
 * of `multiDimArray`, and takes its cosine similarity with `referenceVector`.
 * The intermediate values are logged to the console.
 *
 * @param {Array<Array<number>>} multiDimArray - All rows; they supply the per-column normalization range.
 * @param {Array<number>} queryVector - The row to score.
 * @returns {number|undefined} Whatever getCosineSimilarityScore returns for the normalized query.
 */
function getAssociateWorkloadScore(multiDimArray,queryVector){
    // Never populated in this function; kept because its (empty) value is logged below.
    var normalizedRows = [];

    //Apply feature scaling
    var weightedQuery = getWeightedVector(queryVector);
    var weightedRows = getWeightedMultiDimArr(multiDimArray);
    var normalizedQuery = getNormalizedVector(weightedRows, weightedQuery);
    var score = getCosineSimilarityScore(referenceVector, normalizedQuery);

    var logLines = [
        'weightedQueryVector=' + weightedQuery,
        'weightedMultiDimArr=' + JSON.stringify(weightedRows),
        'normalizedMultiDimArr=' + JSON.stringify(normalizedRows),
        'normalizedQueryVector=' + normalizedQuery,
        'workloadScore=' + JSON.stringify(score)
    ];
    for (var k = 0; k < logLines.length; k++) {
        console.log(logLines[k]);
    }
    return score;
}
/**
 * Scores every row of `multiDimArray` against the full set of rows.
 *
 * @param {Array<Array<number>>} multiDimArray - Rows to score.
 * @returns {Array} One getAssociateWorkloadScore result per row, in row order.
 */
function getTeamWorkloadScore(multiDimArray){
    var scores = [];
    var row = 0;
    while (row < multiDimArray.length) {
        scores[row] = getAssociateWorkloadScore(multiDimArray, multiDimArray[row]);
        row += 1;
    }
    return scores;
}
答案 0 :(得分:2)
余弦相似度就是点积除以两个向量范数的乘积。那么为什么不分别写一个点积函数和一个范数函数,再把结果相除呢?(dotproduct 来自 http://c2.com/cgi/wiki?DotProductInManyProgrammingLanguages )
/**
 * Sums the element-wise products of two vectors, using only the positions
 * present in both (up to the shorter length).
 *
 * @param {Array<number>} a - First vector.
 * @param {Array<number>} b - Second vector.
 * @returns {number} The dot product; 0 when either vector is empty.
 */
function dotproduct(a,b) {
    var limit = Math.min(a.length, b.length);
    var sum = 0;
    var idx = 0;
    while (idx < limit) {
        sum += a[idx] * b[idx];
        idx += 1;
    }
    return sum;
}
/**
 * Computes the Euclidean (L2) norm of a vector: the square root of the sum of squares.
 *
 * @param {Array<number>} a - Input vector.
 * @returns {number} The norm; 0 for an empty vector.
 */
function norm2(a) {
    var total = 0;
    var idx = 0;
    while (idx < a.length) {
        total += a[idx] * a[idx];
        idx += 1;
    }
    return Math.sqrt(total);
}
/**
 * Cosine similarity of two vectors: the dot product divided by each norm in turn.
 * No zero-vector check is made, so a zero-magnitude input yields NaN.
 *
 * @param {Array<number>} a - First vector.
 * @param {Array<number>} b - Second vector.
 * @returns {number} dotproduct(a, b) / norm2(a) / norm2(b).
 */
function similarity(a, b) {
    var dot = dotproduct(a, b);
    var magnitudeA = norm2(a);
    var magnitudeB = norm2(b);
    return dot / magnitudeA / magnitudeB;
}
现在 similarity([1,0,0], [0,1,1]) == 0
答案 1 :(得分:1)
如果你确实需要尺度不变性(即原始的余弦相似度),可以在 Gavin 的代码基础上增加对零向量的检查:
/**
 * Cosine similarity with a guard for zero-magnitude vectors.
 *
 * @param {Array<number>} x - First vector.
 * @param {Array<number>} y - Second vector.
 * @returns {number} dotproduct(x, y) / (norm2(x) * norm2(y)), or 0 when either
 *     norm is falsy (zero or NaN).
 */
function cosine_sim(x, y) {
    // Declared locally: assigning to undeclared names creates globals and throws in strict mode.
    var xnorm = norm2(x);
    if(!xnorm) return 0;
    var ynorm = norm2(y);
    if(!ynorm) return 0;
    return dotproduct(x, y) / (xnorm * ynorm);
}
如果你不需要尺度不变性,直接使用点积即可(即令 cosine_sim(x, y) 等于 dotproduct(x, y))。