我试图为机器学习模型编写分类器算法,但是它带有错误。有人可以帮忙吗?预先感谢
import pandas as pd
from sklearn.metrics import accuracy_score
from scipy.spatial import distance
def euc(a, b):
return distance.euclidean(a,b)
class classifierKN():
def fit(self, X_train, Y_train):
self.X_train = X_train
self.Y_train = Y_train
def predict(self, X_test):
predictions = []
for row in X_test:
label = self.closest(row)
predictions.append(label)
return predictions
def closest(self, row):
best_dist = euc(row, self.X_train[0])
best_index = 0
for i in range(1, len(self.X_train)):
dist = euc(row, self.X_train[i])
if dist < best_dist:
best_dist = dist
best_index = i
return self.Y_train[best_index]
#Load the dataset
diabetdata = pd.read_csv("diabetes.csv")
#set features and target
features = ["PlasmaGlucose", "DiastolicBloodPressure", "TricepsThickness", "SerumInsulin"]
X = diabetdata[features]
print("FEATURES: " , X.head())
Y = diabetdata.Diabetic
print("TARGET: " , Y.head())
print("")
from sklearn.model_selection import train_test_split #No module named 'sklearn.cross_validation' so I replace it with model_selection
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=0)
#predict
model= classifierKN()
model.fit(X_train,Y_train)
predictKN = model.predict(X)
print ("Predict result with KNeighborsClassifier")
print(predictKN)
#accuracy
print("Accuracy")
print (accuracy_score(Y, predictKN))
结果
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Vlad\Desktop\Machine learning\Machine Learning\coursework\test2.py", line 63, in <module>
predictKN = model.predict(X)
File "C:\Users\Vlad\Desktop\Machine learning\Machine Learning\coursework\test2.py", line 26, in predict
label = self.closest(row)
File "C:\Users\Vlad\Desktop\Machine learning\Machine Learning\coursework\test2.py", line 30, in closest
best_dist = euc(row, self.X_train[0])
File "E:\Anaconda\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "E:\Anaconda\lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
答案 0 :(得分:1)
您的代码实际上同时存在多个问题,因此要理解它有点困难。 您的问题似乎主要与您对pandas Dataframes / Series的理解有关,因为您显然正在尝试使用以下方法遍历Dataframe的行:
def predict(self, X_test):
predictions = []
for row in X_test:
label = self.closest(row)
predictions.append(label)
return predictions
这不适用于熊猫。要实际遍历行的值,您将需要以下内容:
def predict(self, X_test):
predictions = []
for row in X_test.iterrows():
label = self.closest(list(row[1]))
predictions.append(label)
return predictions
此函数实际上确实遍历数据框中的行,并将行的值提供给closest()
函数。`
def closest(self, row):
best_dist = euc(row, self.X_train[0])
best_index = 0
for i in range(1, len(self.X_train)):
dist = euc(row, self.X_train[i])
if dist < best_dist:
best_dist = dist
best_index = i
return self.Y_train[best_index]
但是该功能不起作用,因为您基本上是尝试使用best_dist = euc(row, self.X_train[0])
获取row [0]的值。这只是抛出一个keyError,因为X_train是一个Dataframe并且没有第0列(无论如何您都不想索引该列)。您想要的是默认的best_dist作为输入行与数据框中第一行之间的距离。这将与类似
best_dist = euc(row, self.X_train.iloc[0])
。
然后,您需要遍历X_train中的行(此处的函数与以前一样存在问题),因此需要将其更改为类似以下内容:
def closest(self, row):
best_dist = euc(row, self.X_train.iloc[0])
best_index = 0
for i in range(1, len(self.X_train.index)):
dist = euc(row, list(self.X_train.iloc[i]))
if dist < best_dist:
best_dist = dist
best_index = i
return self.Y_train.iloc[best_index]
这至少有效。无论它是否能为您提供所需的输出和/或是否足够准确,我都无法保证,但是它确实可以解决您的直接问题。