
时间:2014-07-07 17:33:05

标签: c# optimization classification knn



// Classifies every test sample with a K-nearest-neighbours majority vote.
//
// testSamples and trainSamples consist of about 20k vectors each with 25 dimensions.
// trainClasses holds one entry per training sample; a value of 1 counts as a "yea" vote,
// anything else as a "nay" vote.
// NOTE(review): trainClasses is declared as IList<int[]>, so the label is read from element [0]
// of each entry - confirm that this matches the caller's data layout.
//
// Returns an array with one predicted class (0 or 1) per test sample. A sample is classified
// as 1 only when strictly more than half of its neighbours vote 1; a tied vote yields 0.
// If K exceeds the number of training samples, all training samples vote; if K <= 0 nobody
// votes and every result is 0.
static int[] TestKnnCase(IList<double[]> trainSamples, IList<double[]> testSamples, IList<int[]> trainClasses, int K)
{
    Console.WriteLine("Performing KNN with K = " + K);

    var testNumber = testSamples.Count;
    var trainNumber = trainSamples.Count;
    var testResults = new int[testNumber];

    // Never ask for more neighbours than there are training samples.
    var neighbours = Math.Max(0, Math.Min(K, trainNumber));

    // Scratch buffers allocated once and reused for every test sample.
    // distances[i] is the squared distance to training sample i; indices is the
    // permutation of training-sample indices that gets sorted by that distance.
    var distances = new double[trainNumber];
    var indices = new int[trainNumber];

    // Orders training indices by ascending distance. Equal distances are ordered by the
    // lower training index so that the chosen neighbours are deterministic.
    Comparison<int> byDistanceThenIndex = (a, b) =>
    {
        var byDistance = distances[a].CompareTo(distances[b]);
        return byDistance != 0 ? byDistance : a.CompareTo(b);
    };

    for (var tst = 0; tst < testNumber; tst++)
    {
        var testSample = testSamples[tst];

        // For this test sample, calculate the distance from every training sample.
        // Each iteration writes only to its own slot, so no locking is needed.
        Parallel.For(0, trainNumber, trn =>
        {
            distances[trn] = GetDistance(testSample, trainSamples[trn]);
            indices[trn] = trn; // reset the permutation left over from the previous sort
        });

        Array.Sort(indices, byDistanceThenIndex);

        // Majority vote among the nearest neighbours.
        var yea = 0;
        var nay = 0;
        for (var n = 0; n < neighbours; n++)
        {
            if (trainClasses[indices[n]][0] == 1)
            {
                yea++;
            }
            else
            {
                nay++;
            }
        }

        testResults[tst] = yea > nay ? 1 : 0;
    }

    return testResults;
}

// Calculates and returns the square of the Euclidean distance between two vectors.
//
// sample1, sample2: vectors of equal length.
// Returns the sum over all components of (sample1[i] - sample2[i])^2; 0.0 for empty vectors.
// Throws ArgumentNullException if either vector is null and ArgumentException if the
// lengths differ, instead of silently ignoring the extra components of the longer vector.
static double GetDistance(IList<double> sample1, IList<double> sample2)
{
    if (sample1 == null)
    {
        throw new ArgumentNullException("sample1");
    }
    if (sample2 == null)
    {
        throw new ArgumentNullException("sample2");
    }

    var length = sample1.Count;
    if (sample2.Count != length)
    {
        throw new ArgumentException("Both samples must have the same number of dimensions.", "sample2");
    }

    var distance = 0.0;
    for (var i = 0; i < length; i++)
    {
        var temp = sample1[i] - sample2[i];
        distance += temp * temp;
    }

    return distance;
}



我也找到了这个 Stack Overflow 讨论(this stackoverflow discussion),但它看起来已经是 3 年前的内容了,我希望现在有人知道更好的解决方法。


编辑补充 - 我无法使用 PCA 或其他方法来降低维度。对于这个特定模型,全部 25 个维度都是必需的。

1 个答案:

答案 0 :(得分:3)

每当您尝试提高代码性能时,第一步是分析当前性能以确切了解它花费时间的位置。一个好的剖析器对此至关重要。在我以前的工作中,我能够使用dotTrace profiler取得良好效果; Visual Studio还有一个built-in profiler。一个好的分析器会告诉你你的代码在哪里花费时间逐个方法甚至逐行。


  1. 您目前并行化的是内层循环。能否改为并行化外层循环?委托调用(参见此处和此处)有很小但非零的开销,这种开销可能正在“Parallel.For”回调中拖慢您的代码。

  2. 类似地,通过 IList 接口对数组进行索引也会有很小的性能损失。您可以考虑将“GetDistance()”的参数显式声明为数组。

  3. 与训练数组的大小相比,K 有多大?您对整个“distances”数组进行了完整排序,然后只取前 K 个;但如果 K 远小于数组大小,那么使用部分排序/选择(partial sort / selection)算法可能更合理,例如使用 SortedSet,并在集合大小超过 K 时替换最小元素。