答案 0 :(得分:4)
`[1, 2, 3, 4, None, 6]` 和 `[1, None, 3, 4, 5, 6]` 之间的距离为 `sqrt(1*1 + 3*3 + 4*4 + 6*6)`。在这种情况下,您需要某种 sklearn 支持的自定义距离度量(metric)。不幸的是,您无法将空值传入 KNN 的 `fit()` 方法,因此即使使用自定义距离度量,也无法完全得到您想要的结果。解决方案是预先计算距离。例如:
from math import sqrt, isfinite
# Training features: one row per sample; None marks a missing coordinate.
X_train = [
    [1, 2, 3, 4, None, 6],
    [1, None, 3, 4, 5, 6],
]
# Regression targets, one per row of X_train.
y_train = [3.14, 2.72]  # we're regressing something
def euclidean(p, q):
    """Return the Euclidean distance between the vectors ``p`` and ``q``.

    Coordinates are paired with ``zip``, so if the vectors differ in length
    the trailing coordinates of the longer one are ignored. Two empty
    vectors are at distance ``0.0``.
    """
    # Could also use numpy routines
    return sqrt(sum((x - y) ** 2 for x, y in zip(p, q)))
def is_num(x):
    """Return True when ``x`` is a usable coordinate value.

    ``None``, ``inf``, ``-inf`` and ``nan`` are rejected; any other value is
    passed to ``math.isfinite``.
    """
    # The `is not None` check needs to happen first because of short-circuiting:
    # isfinite(None) would raise a TypeError.
    return x is not None and isfinite(x)
def restricted_points(p, q):
    """Restrict ``p`` and ``q`` to the coordinates that are valid in both.

    Returns a pair of tuples: copies of ``p`` and ``q`` except at coordinates
    where either vector is None, inf, or nan (as decided by ``is_num``).

    When no coordinate is valid in both vectors there is nothing to unzip and
    the result is the empty tuple ``()`` rather than a pair.
    """
    # Keep only the coordinate pairs in which both values are usable numbers.
    shared = [(x, y) for x, y in zip(p, q) if is_num(x) and is_num(y)]
    # Unzip the surviving pairs back into one tuple per input vector.
    return tuple(zip(*shared))
def dist(p, q):
    """Return the distance between ``p`` and ``q`` over their shared valid coordinates.

    Both vectors are first reduced by ``restricted_points``; the reduced
    vectors are then passed to ``euclidean``.

    If the two vectors have no valid coordinate in common,
    ``restricted_points`` yields an empty tuple and the call to ``euclidean``
    raises ``TypeError`` for the missing arguments.
    """
    # Note that in this form you can use any metric you like on the
    # restricted vectors, not just the euclidean metric
    return euclidean(*restricted_points(p, q))
# Square matrix of pairwise distances between all training points:
# row i holds the distances from X_train[i] to every training point.
dists = [[dist(p, q) for p in X_train] for q in X_train]
# NOTE(review): KNeighborsRegressor is presumably sklearn.neighbors.KNeighborsRegressor;
# its import is not visible in this snippet - confirm it is in scope.
knn = KNeighborsRegressor(
    n_neighbors=1,  # only needed in our test example since we have so few data points
    # The inputs to fit()/predict() are distances, not raw feature vectors.
    metric="precomputed",
)
knn.fit(dists, y_train)
# Query points; None marks a missing coordinate, as in the training data.
X_test = [
    [1, 2, 3, None, None, 6],
]
# We tell sklearn which points in the knn graph to use by telling it how far
# our queries are from every input. This is super inefficient.
# Each row has one distance per training point, in X_train order.
predictions = knn.predict([[dist(q, p) for p in X_train] for q in X_test])
答案 1 :(得分:0)
import pandas as pd
# Load the dataset and discard every row that contains a missing value.
df = pd.read_csv("your_data.csv").dropna()