使用sklearn并尝试通过以下代码评估KNN回归函数:
def cross_validate(X,y,n_neighbors, test_size=0.20):
training_mses = []
test_mses = []
n = X.shape[ 0]
test_n = int( np.round( test_size * n, 0))
indices = np.arange(n)
random.shuffle( indices)
test_indices = indices[ 0:test_n]
training_indices = indices[test_n:]
X_test, y_test = X[test_indices], y[test_indices]
X_train,y_train = X[training_indices], y[training_indices]
knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights = "distance",
algorithm = 'brute')
model = knn.fit(X_train,y_train)
y_hat = model.predict( X_train)
training_mse = mse( y_train - y_hat)
model2 = knn.fit(X_test,y_test)
y_hat = model2.predict( X_test)
test_mse = mse( y_test - y_hat)
return training_mse, test_mse
我对线性回归做了类似的事情。我发现的区别是,当我在KNN回归上运行它时,training_mse和test_mse都为0。如果我在装有训练集的模型上使用测试数据,它会给我一个非零的mse值。但是我只是不相信训练和测试集的拟合值与观察值相同。我究竟做错了什么?我尝试模拟的功能如下,并为mse提供了非零值:
def cross_validate( formula, data, test_size=0.20):
training_mses = []
test_mses = []
n = data.shape[ 0]
test_n = int( np.round( test_size * n, 0))
indices = deepcopy( data.index).values
random.shuffle( indices)
test_indices = indices[ 0:test_n]
training_indices = indices[test_n:]
test_set = data.ix[ test_indices]
training_set = data.ix[ training_indices]
y, X = patsy.dmatrices( formula, training_set, return_type="matrix")
model = linear.LinearRegression( fit_intercept=False).fit( X, y)
y_hat = model.predict( X)
training_mse = mse( y - y_hat)
y, X = patsy.dmatrices( formula, test_set, return_type="matrix")
y_hat = model.predict( X)
test_mse = mse( y - y_hat)
return training_mse, test_mse