我需要计算多个文档与查询之间的余弦相似度。
/// <summary>
/// Tokenizes the hard-coded query and document, builds the list of distinct
/// words occurring in either of them, and returns the matrix produced by
/// GetWeights for those inputs.
/// </summary>
/// <returns>
/// The matrix returned by GetWeights. The token lists are passed with the
/// query first and the document second.
/// </returns>
private static double[,] getContent()
{
    string query = "access control policies game";
    string document1 = "The game of life. is a game of, everlasting learning";
    char[] separator = new char[] { ' ', '.', ',', ':', ';' };

    // RemoveEmptyEntries matters here: adjacent separators such as ". " or ", "
    // would otherwise yield empty-string tokens that end up counted as words.
    var splitedQuery = query.Split(separator, StringSplitOptions.RemoveEmptyEntries).ToList();
    var splitedDocument1 = document1.Split(separator, StringSplitOptions.RemoveEmptyEntries).ToList();

    // Build the vocabulary from the tokens already produced instead of
    // re-splitting a concatenated string.
    var unicalWords_D1 = splitedQuery.Concat(splitedDocument1).Distinct().ToArray();

    List<List<string>> documents = new List<List<string>> { splitedQuery, splitedDocument1 };
    return GetWeights(documents, unicalWords_D1);
}
/// <summary>
/// Allocates a weight matrix with one row per entry of
/// <paramref name="unicalWords"/> and one column per entry of
/// <paramref name="splitedDocuments"/>.
/// </summary>
/// <param name="splitedDocuments">Token lists, one per document.</param>
/// <param name="unicalWords">The words that index the matrix rows.</param>
/// <returns>Declared as a two-dimensional array of weights.</returns>
public static double[,] GetWeights(List<List<string>> splitedDocuments, string[] unicalWords)
{
double[,] matrix = new double[unicalWords.Count(), splitedDocuments.Count];
// some processing for term frequency (tf) and inverse term frequency (idf)
// NOTE(review): the weighting computation is elided in this listing. `j`, `i`
// and `weight` are not declared in the visible code, and the expression below
// evaluates to a single double rather than the declared double[,] return type.
// Presumably the full version fills matrix[j, i] inside nested loops and then
// returns `matrix` - TODO confirm against the real implementation.
return matrix[j, i] = weight;
}
/// <summary>
/// Computes the cosine similarity of two vectors: their dot product divided
/// by the product of their magnitudes.
/// </summary>
/// <param name="vecA">First vector.</param>
/// <param name="vecB">Second vector; indexed over the length of <paramref name="vecA"/>.</param>
/// <returns>
/// The cosine similarity, or 0 when the product of the magnitudes is zero
/// (for example when either vector contains only zeros).
/// </returns>
private static double CalculateCosineSimilarity(double[] vecA, double[] vecB)
{
    var dotProduct = DotProduct(vecA, vecB);
    var denominator = Magnitude(vecA) * Magnitude(vecB);

    // A zero denominator would make the division below yield NaN, which cannot
    // be ordered meaningfully when similarities are sorted. Treat such a pair
    // as having no similarity instead.
    if (denominator == 0)
    {
        return 0;
    }

    return dotProduct / denominator;
}
/// <summary>
/// Sums the element-wise products of two vectors, iterating over the indices
/// of <paramref name="vecA"/>.
/// </summary>
/// <param name="vecA">First vector; its length determines how many terms are summed.</param>
/// <param name="vecB">Second vector; read at the same indices as <paramref name="vecA"/>.</param>
/// <returns>The accumulated sum of products (0 when <paramref name="vecA"/> is empty).</returns>
private static double DotProduct(double[] vecA, double[] vecB)
{
    double sum = 0;
    int index = 0;
    while (index < vecA.Length)
    {
        sum += vecA[index] * vecB[index];
        index++;
    }
    return sum;
}
/// <summary>
/// Returns the square root of the vector's dot product with itself.
/// </summary>
/// <param name="vector">The vector to measure.</param>
/// <returns>The square root of DotProduct(vector, vector).</returns>
private static double Magnitude(double[] vector)
{
    double squaredNorm = DotProduct(vector, vector);
    return Math.Sqrt(squaredNorm);
}
以上就是所有的辅助方法，下面是该类的 Main 方法。
/// <summary>
/// Entry point: obtains the weight matrix from getContent, copies its first
/// two columns into separate vectors, prints their cosine similarity and
/// waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    double[,] weights = getContent();
    int rowCount = weights.GetLength(0);
    double[] firstColumn = new double[rowCount];
    double[] secondColumn = new double[rowCount];

    // Copy column 0 (first doc) and column 1 (second doc) in a single pass.
    for (int row = 0; row < rowCount; row++)
    {
        firstColumn[row] = weights[row, 0];
        secondColumn[row] = weights[row, 1];
    }

    double similarity = CalculateCosineSimilarity(firstColumn, secondColumn);
    Console.WriteLine("Similarity between Query and Doc1: ");
    Console.WriteLine(similarity);
    Console.ReadKey();
}
以上代码只计算了查询与 document1 之间的相似度。我需要计算多个查询与多个文档之间的余弦相似度，把结果存储在数组中，并按降序排序。我该如何实现这些功能？
非常感谢任何建议。谢谢！