我正在Kaggle(https://www.kaggle.com/karangadiya/fifa19)上对FIFA 2019数据集进行KNN分析 消除N / A值后,该数据集有18159行。
我正在尝试基于各种特征(例如耐力,射击能力,完成度,总体(他们的评分)等)来预测玩家是否是“进攻者”。
如果玩家的位置是LS,ST,RS,RF,LF或CF,我们会将其视为攻击者。
我很困惑为什么错误率会随着K的数量增加而增加(我绘制了10个最近的邻居)。我认为通常是相反的,当您考虑更多邻居时,错误率趋于下降?
有人认为这种方法有问题吗,还是可以解释为什么他们认为错误率在增加?
机器学习的新手,任何见识都值得赞赏
sampledata<-data
sampledata$indicator = "No"
for(i in 1:NROW(sampledata)){
if (sampledata$Position[i] == "ST" | sampledata$Position[i] == "LS" | sampledata$Position[i] == "RS"| sampledata$Position[i] == "RF" | sampledata$Position[i] == "LF" | sampledata$Position[i] == "CF"){
sampledata$indicator[i] = "Yes"
}
}
print(head(sampledata))
sampledata = sampledata[,c("Overall","Finishing","Positioning","ShotPower","Balance","Stamina","Strength","indicator")]
sampledata <- na.omit(sampledata) #eliminate any na values
any(is.na(sampledata))
sampledata$Overall <- as.numeric(sampledata$Overall)
sampledata$Finishing <- as.numeric(sampledata$Finishing)
sampledata$Positioning <- as.numeric(sampledata$Positioning)
sampledata$ShotPower <- as.numeric(sampledata$ShotPower)
sampledata$Balance <- as.numeric(sampledata$Balance)
sampledata$Stamina <- as.numeric(sampledata$Stamina)
sampledata$Strength <- as.numeric(sampledata$Strength)
sampledata$indicator <- as.numeric(as.factor((sampledata$indicator)))
library(caTools)
sample = sample.split(sampledata, SplitRatio = .80) #take random sample of 80% of rows to make training data
test_knn <- sampledata[-sample,]
test_attacker <- sampledata$indicator[-sample]
train_knn <- sampledata[sample,]
train_attacker <- sampledata$indicator[sample]
#Choose a K value
library(class)
predicted.attacker <- NULL
error.rate <- NULL
for (i in 1:10){
set.seed(101)
predicted.attacker <- knn(train_knn,test_knn,train_attacker,k=i)
error.rate[i] <- mean(test_attacker != predicted.attacker)
}
print(error.rate)
#visualize K elbow method
library(ggplot2)
k.values <- 1:10
error.df <- data.frame(error.rate,k.values)
print(error.df) #shows error rate for each k value
ggplot(error.df,aes(k.values,error.rate)) + geom_point() +
geom_line(lty='dotted',color='red')