我正在R中学习knn。首先我加载了一个csv文件(https://da5020.weebly.com/uploads/8/6/5/9/8659576/franchisesales.csv)
data <- read.csv("franchisesales.csv", stringsAsFactors = FALSE)
然后将其实现为knn函数:
n <- nrow(data)
# dist - calculates the Euclidean distance between
# two vectors containing numeric elements
dist <- function(dataset,x){
d <- 0
for (i in 1:n) {
d <- d + (dataset[1]-x[1])^2+(dataset[2]-x[2])^2+(dataset[3]-x[3])^2+(dataset[4]-x[4])^2+(dataset[5]-x[5])^2
}
distance <- sqrt(d)
}
# neighbor - returns a vector of distances between an object u
# and a set data frame of features
neighbor <- function (dataset,x){
ds <- numeric(n)
for (i in 1:n) {
ds[i] <- dist(dataset[i,],x)
}
neighbors <- ds
unlist(neighbors)
}
# k.closest - finds the smallest k values in a vector of values
k.closest <- function (neighbor,k){
ordered.neighbors <- order(neighbor)
k.closest <- ordered.neighbors[1:k]
}
# knn - finds the most likely class that an unknown object u
# belongs to based on a training data frame of features and
# a provided k
knn <- function (dataset, y, x, k)
{
nb <- neighbor(dataset,x)
f <- k.closest(nb,k)
kNN <- mean(y[f])
}
我想通过预测数据集中的每个实际值并将其与实际观察值进行比较来计算kNN的均方误差(MSE),所以
for (i in 1:nrow(data)) {
data$pred.new[i] <- knn(data, data$NetSales, data[i, 2:6], 3)
data$errs[i] <- data$NetSales[i] - data$pred.new[i]
}
MSE.new <- mean(data$errs^2)
但是我注意到data $ pred.new都是相同的值26.1667。为什么会这样?如何解决?感谢您的帮助!
请注意:自发布以来,该代码已被稍作修改。