m折交叉验证准确率计算错误

时间:2019-04-17 21:31:44

标签: python-3.x machine-learning cross-validation fold nearest-neighbor

我正在使用自己的自定义k最近邻函数进行m折交叉验证,不仅对于输入的训练和测试集,我的准确度值很少更改,而且总体准确度非常差。我正在"fglass.dat"数据集上测试我的函数,其中"fglass.grp"文档用于定义应该删除的列(m折)并将其用作KNN的测试集。虽然我相信负责此测试列提取的"createFolds"函数可以正常工作,但我不知道如何在我的KNN函数代码中确定正确的类类型。我应该如何确定"fglass.dat"所有214行的类类型?"fragArray"是"fglass.dat"数据,而"iterArray"是"fglass.grp"数据。

根据#Debugging调试输出,欧几里得距离计算几乎总是选中同样的几个trainSample索引,导致每次运行得到的"typeIndex"始终相同。这使得精度循环只把该类类型判为正确,而所有其他类类型都判为错误(例如:如果所有testSample的typeIndex都是5,那么1到7中所有不是5的真实类别都会被判错),从而导致同样糟糕的精度。因此,据我所知,问题在于"typeIndex"永远不会改变。

def EuclidCalc(data1, data2, length):
    """Return the Euclidean distance between a scalar test value and a
    training-row vector.

    data1  -- a single numeric value (for this project the test data has
              only one element, so it is compared against every feature)
    data2  -- a sequence of numeric feature values
    length -- how many leading elements of data2 to include
    """
    squared_gaps = ((data1 - data2[idx]) ** 2 for idx in range(length))
    return math.sqrt(sum(squared_gaps))

def KNNPredict(trainFold, testFold, classInfo, k):
    """Classify each sample in testFold with k-nearest-neighbours and
    return the resulting accuracy as a float in [0, 1].

    trainFold -- list of training rows (each row a list of numeric features)
    testFold  -- list of scalar test values; one prediction is made per value
    classInfo -- class labels; classInfo[i] serves both as the label of
                 training row i and as the expected label of test sample i
                 (NOTE(review): this alignment only holds when the test fold
                 covers every dataset row, as it does in this project)
    k         -- number of nearest neighbours that vote on each prediction

    Bug fix: accuracy is now totalCorrect divided by the number of
    predictions actually made (len(testFold)), not by len(classInfo);
    the old divisor understates accuracy whenever the test fold is smaller
    than the full dataset.  The interactive debug prints and the blocking
    input() pause were removed so the function can run unattended, and an
    empty test fold returns 0.0 instead of dividing by zero.
    """
    fragTypes = []  # predicted label for every test sample, in order

    for testValue in testFold:
        # Distance from this scalar test value to every training row.
        # The scalar is compared against each feature of the row, which
        # mirrors the original EuclidCalc usage (inlined here so the
        # function is self-contained).
        distances = []
        for trainIndex, trainRow in enumerate(trainFold):
            dist = math.sqrt(sum((testValue - feature) ** 2
                                 for feature in trainRow))
            distances.append((dist, trainIndex))

        # Ascending sort; ties on distance fall back to the row index,
        # exactly like the original sorted([distance, index]) pairs.
        distances.sort()

        # Labels of the k closest training rows.
        nearestNeighbors = [classInfo[index] for _, index in distances[:k]]

        # Majority vote over the neighbour labels.
        classVotes = {}
        for vote in nearestNeighbors:
            classVotes[vote] = classVotes.get(vote, 0) + 1

        # The label with the most votes wins; on a tie the first-seen
        # label wins (dict insertion order), matching the original max().
        fragTypes.append(max(classVotes, key=classVotes.get))

    totalCorrect = 0
    for index, predicted in enumerate(fragTypes):
        # Compare as ints because labels may arrive as strings.
        if int(predicted) == int(classInfo[index]):
            totalCorrect += 1

    # Guard the empty fold; divide by the number of predictions made,
    # not by the size of the whole label list (this was the bug).
    if not fragTypes:
        return 0.0
    return float(totalCorrect) / len(fragTypes)

def createFolds(fragArray, iterArray):
    """Build one [trainSet, testSet] fold per iteration number.

    fragArray -- the full data matrix (list of rows)
    iterArray -- rows of [columnNumber, iterationNumber] pairs; for each
                 iteration 1..len(iterArray), the matching column (1-based)
                 is pulled out of fragArray as the test set and the
                 remaining columns form the training set

    Returns a list of [trainSet, testSet] pairs, one per iteration.
    """
    folds = []

    # Persists across iterations: if no pair matches an iteration number,
    # the previously found column is reused (last-match-wins scan below).
    randomClass = 0

    for iterNum in range(1, len(iterArray) + 1):
        # Scan every pair; the last one whose iteration matches wins.
        for pair in iterArray:
            if int(pair[1]) == iterNum:
                randomClass = pair[0]

        colIndex = randomClass - 1  # convert 1-based column to 0-based

        # The chosen column becomes the test set ...
        testSet = [row[colIndex] for row in fragArray]
        # ... and the matrix without that column becomes the training set.
        trainSet = np.delete(fragArray, colIndex, 1).tolist()

        folds.append([trainSet, testSet])

    return folds

def KNNCrossCalc(fragArray, iterArray, k):
    """Run m-fold cross-validation with KNN and print the mean accuracy.

    fragArray -- full data matrix; the last column of each row is treated
                 as the class label
    iterArray -- fold-definition pairs handed to createFolds
    k         -- neighbour count forwarded to KNNPredict
    """
    # The class label sits in the last column of every row.
    labelColumn = len(fragArray[0]) - 1
    classInfo = [row[labelColumn] for row in fragArray]

    # Each fold is a [trainSet, testSet] pair.
    folds = createFolds(fragArray, iterArray)

    # Accuracy of KNN on every fold, accumulated in fold order.
    foldAccuracies = [
        KNNPredict(trainSet, testSet, classInfo, k)
        for trainSet, testSet in folds
    ]

    averageAccuracy = float(sum(foldAccuracies) / len(folds))
    print("\nAverage accuracy for all folds: ", (averageAccuracy* 100), "%")

0 个答案:

没有答案