我在 Python 中运行了一个随机森林模型,并且已经能够看到分类结果表。但我希望得到一份完整的 Python 代码,涵盖数据准备、模型训练、模型验证和准确率检查等所有环节。另外,我的模型出现了很多误报(假阳性),如果能提供任何改进建议,将不胜感激。
答案 0 :(得分:2)
请参阅以下代码:
import urllib2
import numpy
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import random
from math import sqrt
import matplotlib.pyplot as plot
# Define function confusion matrix
def confusionMatrix(predicted, actual, threshold):
    """Count the four confusion-matrix cells for thresholded predictions.

    An actual label greater than 0.5 is treated as a positive example and
    anything else as a negative example. A prediction is treated as positive
    only when it is strictly greater than ``threshold``; the same rule is
    applied to positive and negative examples, so a prediction exactly equal
    to the threshold is consistently counted as a negative prediction.

    Args:
        predicted: sequence of numeric prediction scores.
        actual: sequence of numeric labels, same length as ``predicted``.
        threshold: cut-off above which a score counts as a positive call.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        have different lengths.
    """
    if len(predicted) != len(actual):
        return -1
    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for score, label in zip(predicted, actual):
        # One decision rule for every example keeps ties from being scored
        # as an error regardless of the true label.
        predictedPositive = score > threshold
        if label > 0.5:  # positive example
            if predictedPositive:
                tp += 1.0  # correctly predicted positive
            else:
                fn += 1.0  # positive example missed
        else:  # negative example
            if predictedPositive:
                fp += 1.0  # negative example flagged as positive
            else:
                tn += 1.0  # correctly predicted negative
    return [tp, fn, fp, tn]
# Download the comma-separated data file and parse it into feature rows
# (xList) and labels (labels); the label is the last field of each record.
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
data = urllib2.urlopen(target_url)
xList = []
labels = []
# NOTE(review): names and firstline are not used by the code visible here;
# they are kept in case something later in the file refers to them.
names = []
firstline = True
try:
    for line in data:
        stripped = line.strip()
        # A blank line would otherwise reach float('') and raise ValueError.
        if not stripped:
            continue
        # Split the record on "," into its individual fields.
        row = stripped.split(",")
        # The last column is the label; pop() removes it from the row.
        labels.append(float(row.pop()))
        # Everything that remains is the numeric feature vector.
        xList.append([float(num) for num in row])
finally:
    # Always release the network handle, even if parsing fails part-way.
    data.close()
nrows = len(xList)
ncols = len(xList[0])
# Split the data: a seeded random 30% of the rows become the test set and
# the remaining rows become the training set.
random.seed(1)  # fixed seed so the split is reproducible
nSample = int(nrows * 0.30)
idxTest = random.sample(range(nrows), nSample)
idxTest.sort()
# Set membership is O(1); testing against the list made this step quadratic.
testIdxSet = set(idxTest)
idxTrain = [idx for idx in range(nrows) if idx not in testIdxSet]
xTrain = [xList[r] for r in idxTrain]
xTest = [xList[r] for r in idxTest]
yTrain = [labels[r] for r in idxTrain]
yTest = [labels[r] for r in idxTest]
# Train an ensemble of decision trees. Each tree sees a random subset of
# nAttr attributes and a sample, drawn with replacement, of half as many
# rows as the training set; its test-set predictions are stored in predList.
numTreesMax = 30
treeDepth = 12
nAttr = 4
modelList = []   # fitted trees, in training order
indexList = []   # attribute indices used by each tree
predList = []    # per-tree predictions on xTest
nTrainRows = len(yTrain)
nBagRows = int(0.5 * nTrainRows)
# Built once: the original rebuilt this range for every single draw.
trainRowRange = range(len(xTrain))
for iTrees in range(numTreesMax):
    # Random attribute subset for this tree.
    idxAttr = random.sample(range(ncols), nAttr)
    idxAttr.sort()
    indexList.append(idxAttr)
    # Row sample with replacement (one random.choice call per drawn row).
    idxRows = [random.choice(trainRowRange) for _ in range(nBagRows)]
    idxRows.sort()
    # Restrict the sampled rows to the chosen attributes.
    xRFTrain = [[xTrain[r][j] for j in idxAttr] for r in idxRows]
    yRFTrain = [yTrain[r] for r in idxRows]
    modelList.append(DecisionTreeClassifier(max_depth = treeDepth))
    modelList[-1].fit(xRFTrain,yRFTrain)
    # Test rows must be projected onto the same attribute subset.
    xRFTest = [[xx[j] for j in idxAttr] for xx in xTest]
    latestOutSAmplePrediction = modelList[-1].predict(xRFTest)
    predList.append(list(latestOutSAmplePrediction))
# Evaluate ensembles of growing size: for the first k trees, average their
# test-set predictions, threshold the average at 0.5 through confusionMatrix
# and record the resulting misclassification rate.
# NOTE(review): confusionMatrix treats labels > 0.5 as positive and every
# other label as negative, i.e. it is binary. If the labels loaded above
# take more than two values, averaging predicted labels is not meaningful
# - confirm against the data set.
classerror = []
allPredictions = []
# Running per-row totals avoid re-adding every earlier tree's prediction for
# each ensemble size; the additions happen in the same order as before.
runningTotals = [0.0] * len(xTest)
for iModels in range(len(modelList)):
    nUsed = iModels + 1
    runningTotals = [total + pred
                     for total, pred in zip(runningTotals, predList[iModels])]
    # float() keeps this a true division even for integer predictions under
    # Python 2 semantics.
    prediction = [total / float(nUsed) for total in runningTotals]
    allPredictions.append(prediction)
    conMatTest = confusionMatrix(prediction,yTest,0.5)
    # conMatTest is [tp, fn, fp, tn]; error = 1 - (tp + tn) / total.
    nCorrect = conMatTest[0] + conMatTest[3]
    nTotal = conMatTest[0]+conMatTest[1]+conMatTest[2]+conMatTest[3]
    errors = 1.0 - (nCorrect / nTotal)
    classerror.append(errors)
# Plot the recorded class error against the number of trees in the ensemble.
nModels = list(range(1, len(modelList) + 1))  # ensemble sizes 1..len(modelList)
plot.plot(nModels, classerror)
plot.axis('tight')
plot.xlabel('Number of Trees in Ensamble')
plot.ylabel('Class Error')
# Pin the y axis to start at zero and end at the largest recorded error.
yUpper = max(classerror)
plot.ylim((0.0, yUpper))
plot.show()