Question

我在 Python 中运行了一个随机森林模型，并且能够看到分类表。但我希望得到一份完整的 Python 代码，涵盖从数据准备、模型运行、模型验证到准确性检查的所有环节。另外，我的模型中出现了很多误报（假阳性），任何有助于改进的建议都会非常有帮助。

Answer 1

请参阅以下代码：

import urllib2
import numpy
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import random
from math import sqrt
import matplotlib.pyplot as plot


# Define function confusion matrix
def confusionMatrix(predicted, actual, threshold):
    """Count the confusion-matrix cells for thresholded binary predictions.

    A label counts as positive when it is greater than 0.5. A prediction
    counts as positive only when it is strictly greater than ``threshold``;
    a score exactly equal to ``threshold`` is treated as a negative
    prediction regardless of the true label, so ties are handled the same
    way for positive and negative examples.

    Args:
        predicted: Sequence of numeric scores, one per example.
        actual: Sequence of numeric labels, the same length as ``predicted``.
        threshold: Cut-off applied to each score.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        differ in length.
    """
    if len(predicted) != len(actual):
        return -1
    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for score, label in zip(predicted, actual):
        predictedPositive = score > threshold
        if label > 0.5:  # positive example
            if predictedPositive:
                tp += 1.0  # correctly predicted positive
            else:
                fn += 1.0  # incorrectly predicted negative
        else:  # negative example
            if predictedPositive:
                fp += 1.0  # incorrectly predicted positive
            else:
                tn += 1.0  # correctly predicted negative
    return [tp, fn, fp, tn]



# Download the data set: one comma-separated record per line, with the label
# in the last column and the features in all preceding columns.
# NOTE: urllib2 is a Python 2 module.
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
data = urllib2.urlopen(target_url)

xList = []   # feature vectors, one list of floats per record
labels = []  # label of the record at the same position in xList
names = []        # not used by the code below
firstline = True  # not used by the code below

try:
    for line in data:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line has no fields to parse.
            continue
        # split the record on the "," sign
        row = line.split(",")
        # the last column is the label; pop() also removes it from the row
        labels.append(float(row.pop()))
        # the remaining columns form the feature vector
        floatRow = [float(num) for num in row]
        xList.append(floatRow)
finally:
    # Release the connection even when a record fails to parse.
    data.close()


nrows = len(xList)
ncols = len(xList[0])

# Split the data: 30% of the rows are held out for testing, the rest train.
random.seed(1)  # fixed seed for the random draws below
nSample = int(nrows * 0.30)
idxTest = random.sample(range(nrows), nSample)
idxTest.sort()
# Membership is tested against a set so that building idxTrain stays linear
# in the number of rows instead of scanning the idxTest list for every row.
idxTestSet = set(idxTest)
idxTrain = [idx for idx in range(nrows) if idx not in idxTestSet]

xTrain = [xList[r] for r in idxTrain]
xTest = [xList[r] for r in idxTest]
yTrain = [labels[r] for r in idxTrain]
yTest = [labels[r] for r in idxTest]



# Ensemble settings
numTreesMax = 30  # number of trees to train

treeDepth = 12    # max_depth passed to every tree

nAttr = 4         # number of feature columns each tree is trained on

modelList = []  # fitted trees, in training order
indexList = []  # feature-column indices used by the tree at the same position
predList = []   # test-set predictions of the tree at the same position
nTrainRows = len(yTrain)

# Loop invariants: the pool of training-row indices to draw from and the
# number of rows drawn per tree are the same for every tree.
trainRowPool = range(len(xTrain))
nRowsPerTree = int(0.5 * nTrainRows)

for iTrees in range(numTreesMax):

    # Random subset of the feature columns for this tree.
    idxAttr = random.sample(range(ncols), nAttr)
    idxAttr.sort()
    indexList.append(idxAttr)

    # Rows drawn with replacement (the same row may be picked repeatedly).
    idxRows = [random.choice(trainRowPool) for _ in range(nRowsPerTree)]
    idxRows.sort()

    # Training set for this tree: the sampled rows, restricted to idxAttr.
    xRFTrain = [[xTrain[r][j] for j in idxAttr] for r in idxRows]
    yRFTrain = [yTrain[r] for r in idxRows]

    model = DecisionTreeClassifier(max_depth = treeDepth)
    model.fit(xRFTrain, yRFTrain)
    modelList.append(model)

    # The test rows must be restricted to the same columns before predicting.
    xRFTest = [[xx[j] for j in idxAttr] for xx in xTest]

    latestOutSAmplePrediction = model.predict(xRFTest)
    predList.append(list(latestOutSAmplePrediction))



# Test-set error of the growing ensemble: entry k of classerror and
# allPredictions is computed from the first k + 1 trees.
# NOTE(review): confusionMatrix treats labels > 0.5 as positive, so the
# averaged predictions are scored as a two-class problem -- confirm this
# matches the label encoding of the data set.
classerror = []
allPredictions = []
# Running per-example sum of the trees' predictions, so each additional tree
# costs one pass over the test set instead of re-summing all earlier trees.
predSums = [0] * len(xTest)
for iModels in range(len(modelList)):
    treePreds = predList[iModels]
    predSums = [total + treePreds[iPred] for iPred, total in enumerate(predSums)]
    # Ensemble prediction = mean of the individual trees' predictions.
    prediction = [total / (iModels + 1) for total in predSums]

    allPredictions.append(prediction)
    conMatTest = confusionMatrix(prediction, yTest, 0.5)
    # conMatTest is [tp, fn, fp, tn]; error = 1 - (tp + tn) / total
    errors = 1.0 - ((conMatTest[0] + conMatTest[3]) / (conMatTest[0] + conMatTest[1] + conMatTest[2] + conMatTest[3]))
    classerror.append(errors)





# Plot the test-set class error against the number of trees in the ensemble.
# x axis: ensemble sizes 1, 2, ..., len(modelList)
nModels = list(range(1, len(modelList) + 1))

plot.plot(nModels, classerror)
plot.axis('tight')
plot.xlabel('Number of Trees in Ensamble')
plot.ylabel('Class Error')
# y axis runs from zero up to the largest error observed
plot.ylim((0.0, max(classerror)))
plot.show()

Python 中的随机森林

1 个答案: