C# 中的余弦相似度计算

时间:2015-06-23 13:33:19

标签: c# document cosine-similarity

我必须计算不同文档和查询之间的余弦相似性度量。

/// <summary>
/// Tokenizes the hard-coded query and document, builds the vocabulary of
/// distinct words across both, and returns the weight matrix produced by
/// <c>GetWeights</c>.
/// </summary>
/// <returns>
/// The matrix returned by <c>GetWeights</c>. The query is added to the
/// document list first and <c>document1</c> second.
/// </returns>
private static double[,] getContent()
{
    string query = "access control policies game";
    string document1 = "The game of life. is a game of, everlasting learning";

    char[] separator = new char[] { ' ', '.', ',', ':', ';' };

    // RemoveEmptyEntries matters here: adjacent separators such as ". " or ", "
    // would otherwise yield "" tokens, which would then be counted as a word
    // and added to the vocabulary.
    var splitedQuery = query.Split(separator, StringSplitOptions.RemoveEmptyEntries).ToList();
    var splitedDocument1 = document1.Split(separator, StringSplitOptions.RemoveEmptyEntries).ToList();

    // Vocabulary: every distinct token of query + document, in first-seen order.
    var unicalWords_D1 = (query + " " + document1)
        .Split(separator, StringSplitOptions.RemoveEmptyEntries)
        .GroupBy(g => g)
        .Select(s => s.Key)
        .ToArray();

    List<List<string>> documents = new List<List<string>>();
    documents.Add(splitedQuery);
    documents.Add(splitedDocument1);

    return GetWeights(documents, unicalWords_D1);
}

/// <summary>
/// Builds a term-by-document TF-IDF weight matrix.
/// </summary>
/// <remarks>
/// The weighting used is <c>tf * idf</c> with
/// <c>tf = occurrences of the word in the document / number of tokens in the document</c>
/// and <c>idf = ln(documentCount / documentsContainingWord) + 1</c>.
/// The <c>+ 1</c> keeps words that occur in every document from collapsing to
/// weight zero, which would otherwise happen for any shared word when only
/// two documents are compared. Word matching is ordinal and case-sensitive.
/// </remarks>
/// <param name="splitedDocuments">Tokenized documents; one inner list per document. A null inner list is treated as an empty document.</param>
/// <param name="unicalWords">Vocabulary; one matrix row is produced per entry.</param>
/// <returns>
/// A matrix of size <c>[unicalWords.Length, splitedDocuments.Count]</c> where
/// element <c>[j, i]</c> is the weight of word <c>j</c> in document <c>i</c>.
/// Words that occur in no document keep weight 0.
/// </returns>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public static double[,] GetWeights(List<List<string>> splitedDocuments, string[] unicalWords)
{
    if (splitedDocuments == null)
    {
        throw new ArgumentNullException("splitedDocuments");
    }
    if (unicalWords == null)
    {
        throw new ArgumentNullException("unicalWords");
    }

    int documentCount = splitedDocuments.Count;
    double[,] matrix = new double[unicalWords.Length, documentCount];

    // Count each token once per document up front so the per-word pass below
    // is a dictionary lookup rather than a rescan of every document.
    var countsPerDocument = new Dictionary<string, int>[documentCount];
    var tokensPerDocument = new int[documentCount];
    for (int i = 0; i < documentCount; i++)
    {
        var counts = new Dictionary<string, int>(StringComparer.Ordinal);
        List<string> tokens = splitedDocuments[i];
        if (tokens != null)
        {
            foreach (string token in tokens)
            {
                if (token == null)
                {
                    continue;
                }

                int seen;
                counts.TryGetValue(token, out seen);
                counts[token] = seen + 1;
                tokensPerDocument[i]++;
            }
        }
        countsPerDocument[i] = counts;
    }

    for (int j = 0; j < unicalWords.Length; j++)
    {
        string word = unicalWords[j];
        if (word == null)
        {
            continue;
        }

        int documentFrequency = 0;
        for (int i = 0; i < documentCount; i++)
        {
            if (countsPerDocument[i].ContainsKey(word))
            {
                documentFrequency++;
            }
        }

        // A word found in no document has no defined idf; its row stays all zero.
        if (documentFrequency == 0)
        {
            continue;
        }

        double idf = Math.Log((double)documentCount / documentFrequency) + 1.0;
        for (int i = 0; i < documentCount; i++)
        {
            int occurrences;
            if (countsPerDocument[i].TryGetValue(word, out occurrences))
            {
                // tokensPerDocument[i] is at least 1 here because the word was found.
                double tf = (double)occurrences / tokensPerDocument[i];
                matrix[j, i] = tf * idf;
            }
        }
    }

    return matrix;
}

/// <summary>
/// Computes the cosine similarity of two vectors: their dot product divided
/// by the product of their magnitudes.
/// </summary>
/// <param name="vecA">First vector.</param>
/// <param name="vecB">Second vector; expected to have the same length as <paramref name="vecA"/>.</param>
/// <returns>
/// The cosine of the angle between the vectors, or 0 when the product of the
/// magnitudes is zero (for example when either vector is all zeros).
/// </returns>
private static double CalculateCosineSimilarity(double[] vecA, double[] vecB)
{
    var dotProduct = DotProduct(vecA, vecB);
    var denominator = Magnitude(vecA) * Magnitude(vecB);

    // 0.0 / 0.0 evaluates to NaN in floating point, and NaN would poison any
    // later ranking of the scores. A vector with no weight shares nothing
    // with the other one, so report "no similarity" instead.
    if (denominator == 0.0)
    {
        return 0.0;
    }

    return dotProduct / denominator;
}
/// <summary>
/// Computes the dot product of two equally sized vectors.
/// </summary>
/// <param name="vecA">First vector.</param>
/// <param name="vecB">Second vector; must have the same length as <paramref name="vecA"/>.</param>
/// <returns>The sum of the element-wise products; 0 for empty vectors.</returns>
/// <exception cref="ArgumentNullException">Either vector is null.</exception>
/// <exception cref="ArgumentException">The vectors differ in length.</exception>
private static double DotProduct(double[] vecA, double[] vecB)
{
    if (vecA == null)
    {
        throw new ArgumentNullException("vecA");
    }
    if (vecB == null)
    {
        throw new ArgumentNullException("vecB");
    }

    // Without this check a shorter vecB fails with IndexOutOfRangeException
    // part-way through, and a longer vecB has its extra elements silently ignored.
    if (vecA.Length != vecB.Length)
    {
        throw new ArgumentException("Vectors must have the same length.", "vecB");
    }

    double dotProduct = 0;
    for (var i = 0; i < vecA.Length; i++)
    {
        dotProduct += vecA[i] * vecB[i];
    }
    return dotProduct;
}
/// <summary>
/// Returns the length of a vector, computed as the square root of the
/// vector's dot product with itself.
/// </summary>
/// <param name="vector">The vector to measure.</param>
/// <returns>The square root of <c>DotProduct(vector, vector)</c>.</returns>
private static double Magnitude(double[] vector)
{
    double sumOfSquares = DotProduct(vector, vector);
    double length = Math.Sqrt(sumOfSquares);
    return length;
}

以上是所有的辅助方法,下面是该类的 Main 方法。

/// <summary>
/// Builds the weight matrix, scores every document column against the query
/// column, and prints the scores from most to least similar.
/// </summary>
/// <remarks>
/// Column 0 is treated as the query because <c>getContent</c> adds the query
/// to the document list first; columns 1 and up are the documents. With a
/// single document the output is the same two lines as before.
/// </remarks>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var result = getContent();
    int columnCount = result.GetLength(1);

    var scores = new List<KeyValuePair<int, double>>();
    if (columnCount >= 2)
    {
        double[] queryVector = ExtractColumn(result, 0);
        for (int column = 1; column < columnCount; column++)
        {
            double[] documentVector = ExtractColumn(result, column);
            double similarity = CalculateCosineSimilarity(queryVector, documentVector);
            scores.Add(new KeyValuePair<int, double>(column, similarity));
        }
    }

    // OrderByDescending is a stable sort, so documents with equal scores keep
    // their original relative order.
    foreach (var score in scores.OrderByDescending(s => s.Value))
    {
        Console.WriteLine("Similarity between Query and Doc" + score.Key + ": ");
        Console.WriteLine(score.Value);
    }

    Console.ReadKey();
}

/// <summary>Copies one column of a two-dimensional matrix into a new array.</summary>
/// <param name="matrix">Source matrix.</param>
/// <param name="column">Zero-based index of the column to copy.</param>
/// <returns>A new array holding the column's values, one per matrix row.</returns>
private static double[] ExtractColumn(double[,] matrix, int column)
{
    int rowCount = matrix.GetLength(0);
    double[] values = new double[rowCount];
    for (int row = 0; row < rowCount; row++)
    {
        values[row] = matrix[row, column];
    }
    return values;
}

以上代码只计算了查询与 document1 之间的相似度。我需要计算多个查询与多个文档之间的余弦相似度,把结果存储在数组中,并按降序排序。我该如何完成这些任务?

非常感谢任何建议。谢谢!

0 个答案:

没有答案