我在分类技术上找到了两个在线代码。一种技术是Naive Bayes,另一种是KNn。我使用了两个数据集:一个是iris.data,另一个是prima-indians-diabetes.data。
prima indians数据集在Naive Bayes算法中正常工作,Iris.data在KNn算法中正常工作。但我想比较两种算法,这些算法只有在两种算法中运行一个数据集时才有可能。
我将Naive bayes和KNn的算法与两个数据集相关联。以及相应的追溯。
Naive Bayes with iris.data
# Example of Naive Bayes implemented from Scratch in Python
import csv
import random
import math
def loadCsv(filename):
lines = csv.reader(open(filename, "rt"))
dataset = list(lines)
for i in range(len(dataset)):
dataset[i] = [float(x) for x in dataset[i]]
return dataset
def splitDataset(dataset, splitRatio):
trainSize = int(len(dataset) * splitRatio)
trainSet = []
copy = list(dataset)
while len(trainSet) < trainSize:
index = random.randrange(len(copy))
trainSet.append(copy.pop(index))
return [trainSet, copy]
def separateByClass(dataset):
separated = {}
for i in range(len(dataset)):
vector = dataset[i]
if (vector[-1] not in separated):
separated[vector[-1]] = []
separated[vector[-1]].append(vector)
return separated
def mean(numbers):
return sum(numbers) / float(len(numbers))
def stdev(numbers):
avg = mean(numbers)
variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
return math.sqrt(variance)
def summarize(dataset):
summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
del summaries[-1]
return summaries
def summarizeByClass(dataset):
separated = separateByClass(dataset)
summaries = {}
for classValue, instances in separated.items():
summaries[classValue] = summarize(instances)
return summaries
def calculateProbability(x, mean, stdev):
exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
def calculateClassProbabilities(summaries, inputVector):
probabilities = {}
for classValue, classSummaries in summaries.items():
probabilities[classValue] = 1
for i in range(len(classSummaries)):
mean, stdev = classSummaries[i]
x = inputVector[i]
probabilities[classValue] *= calculateProbability(x, mean, stdev)
return probabilities
def predict(summaries, inputVector):
probabilities = calculateClassProbabilities(summaries, inputVector)
bestLabel, bestProb = None, -1
for classValue, probability in probabilities.items():
if bestLabel is None or probability > bestProb:
bestProb = probability
bestLabel = classValue
return bestLabel
def getPredictions(summaries, testSet):
predictions = []
for i in range(len(testSet)):
result = predict(summaries, testSet[i])
predictions.append(result)
return predictions
def getAccuracy(testSet, predictions):
correct = 0
for i in range(len(testSet)):
if testSet[i][-1] == predictions[i]:
correct += 1
return (correct / float(len(testSet))) * 100.0
def main():
filename = 'E:\iris.data.csv'
splitRatio = 0.67
dataset = loadCsv(filename)
trainingSet, testSet = splitDataset(dataset, splitRatio)
print(('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet)))
# prepare model
summaries = summarizeByClass(trainingSet)
# test model
predictions = getPredictions(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print(('Accuracy: {0}%').format(accuracy))
main()
,其回溯是:
runfile(&#39; C:/ Users / Lenovo / Desktop / EE Codes / Knn with prima.py&#39;, wdir =&#39; C:/ Users / Lenovo / Desktop / EE Codes&#39;)回溯(最近一次通话) 最后):
文件 &#34; C:\ Users \用户联想\ Anaconda3 \ lib中\站点包\ IPython的\芯\ interactiveshell.py&#34 ;, 第2862行,在run_code中 exec(code_obj,self.user_global_ns,self.user_ns)
文件&#34;&#34;,第1行,in runfile(&#39; C:/ Users / Lenovo / Desktop / EE Codes / Knn with prima.py&#39;,wdir =&#39; C:/ Users / Lenovo / Desktop / EE Codes&#39;)
文件 &#34; C:\ Users \用户联想\ Anaconda3 \ lib中\站点包\ spyder的\ utils的\站点\ sitecustomize.py&#34 ;, 第710行,在runfile中 execfile(filename,namespace)
文件 &#34; C:\ Users \用户联想\ Anaconda3 \ lib中\站点包\ spyder的\ utils的\站点\ sitecustomize.py&#34 ;, 第101行,在execfile中 exec(compile(f.read(),filename,&#39; exec&#39;),命名空间)
文件&#34; C:/ Users / Lenovo / Desktop / EE Codes / knn with prima.py&#34;,第63行 打印&#39;火车套装:&#39; + repr(len(trainingSet)) ^ SyntaxError:语法无效
与主要印第安人有关:
# Example of kNN implemented from Scratch in Python
import csv
import random
import math
import operator
def loadDataset(filename, split, trainingSet=[] , testSet=[]):
with open(filename, 'rt') as csvfile:
lines = csv.reader(csvfile)
dataset = list(lines)
for x in range(len(dataset)-1):
for y in range(4):
dataset[x][y] = float(dataset[x][y])
if random.random() < split:
trainingSet.append(dataset[x])
else:
testSet.append(dataset[x])
def euclideanDistance(instance1, instance2, length):
distance = 0
for x in range(length):
distance += pow((instance1[x] - instance2[x]), 2)
return math.sqrt(distance)
def getNeighbors(trainingSet, testInstance, k):
distances = []
length = len(testInstance)-1
for x in range(len(trainingSet)):
dist = euclideanDistance(testInstance, trainingSet[x], length)
distances.append((trainingSet[x], dist))
distances.sort(key=operator.itemgetter(1))
neighbors = []
for x in range(k):
neighbors.append(distances[x][0])
return neighbors
def getResponse(neighbors):
classVotes = {}
for x in range(len(neighbors)):
response = neighbors[x][-1]
if response in classVotes:
classVotes[response] += 1
else:
classVotes[response] = 1
sortedVotes = sorted(classVotes.iteritems(), key=operator.itemgetter(1), reverse=True)
return sortedVotes[0][0]
def getAccuracy(testSet, predictions):
correct = 0
for x in range(len(testSet)):
if testSet[x][-1] == predictions[x]:
correct += 1
return (correct/float(len(testSet))) * 100.0
def main():
# prepare data
trainingSet=[]
testSet=[]
split = 0.67
loadDataset('E:\pima-indians-diabetes.data.csv', split, trainingSet, testSet)
print 'Train set: ' + repr(len(trainingSet))
print 'Test set: ' + repr(len(testSet))
# generate predictions
predictions=[]
k = 3
for x in range(len(testSet)):
neighbors = getNeighbors(trainingSet, testSet[x], k)
result = getResponse(neighbors)
predictions.append(result)
print('> predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: ' + repr(accuracy) + '%')
main()
追溯是:
runfile(&#39; C:/ Users / Lenovo / Desktop / EE Codes / Knn with prima.py&#39;, wdir =&#39; C:/ Users / Lenovo / Desktop / EE Codes&#39;)回溯(最近一次通话) 最后):
文件 &#34; C:\ Users \用户联想\ Anaconda3 \ lib中\站点包\ IPython的\芯\ interactiveshell.py&#34 ;, 第2862行,在run_code中 exec(code_obj,self.user_global_ns,self.user_ns)
文件&#34;&#34;,第1行,in runfile(&#39; C:/ Users / Lenovo / Desktop / EE Codes / Knn with prima.py&#39;,wdir =&#39; C:/ Users / Lenovo / Desktop / EE Codes&#39;)
文件 &#34; C:\ Users \用户联想\ Anaconda3 \ lib中\站点包\ spyder的\ utils的\站点\ sitecustomize.py&#34 ;, 第710行,在runfile中 execfile(filename,namespace)
文件 &#34; C:\ Users \用户联想\ Anaconda3 \ lib中\站点包\ spyder的\ utils的\站点\ sitecustomize.py&#34 ;, 第101行,在execfile中 exec(compile(f.read(),filename,&#39; exec&#39;),命名空间)
文件&#34; C:/ Users / Lenovo / Desktop / EE Codes / knn with prima.py&#34;,第63行 打印&#39;火车套装:&#39; + repr(len(trainingSet)) ^ SyntaxError:语法无效
这两位代码有什么问题?