我什么时候放这行代码:
> m1 <- knn( train = trainSetNorm[,c(1:41)], test = testSetNorm[,c(1:41)],
cl = trainSetNorm[,c(42)], k = 703)
我收到以下错误:
Error in knn(train = trainSetNorm[, c(1:41)], test = testSetNorm[, c(1:41)],:
k = 0 must be at least 1
我使用校正的KDD Cup 99作为校正的10%数据集。使用10%并且将训练数据和校正数据用作测试数据。这是确切的代码:
setwd("C:/Users/admin/Desktop/BIGLOU")
加载数据集
testSet <- read.delim('corrected', sep = ',', header = FALSE)
colnames(testSet) <- c("duration", "protocol_type", "service", "flag",
"src_bytes", "dst_bytes", "land",
"wrong_fragment", "urgent", "hot", "num_failed_logins",
"logged_in", "num_compromised",
"root_shell", "su_attempted", "num_root",
"num_file_creations", "num_shells",
"num_access_files", "num_outbound_cmds",
"is_host_login","is_guest_login", "count", "srv_count",
"serror_rate", "srv_serror_rate", "rerror_rate",
"srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
"srv_diff_host_rate", "dst_host_count",
"dst_host_srv_count", "dst_host_same_srv_rate",
"dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate",
"dst_host_serror_rate", "dst_host_srv_serror_rate",
"dst_host_rerror_rate", "dst_host_srv_rerror_rate",
"connection_type")
trainSet <- read.delim('kddcup.data_10_percent_corrected', sep = ',', header = FALSE)
colnames(trainSet) <- c("duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land",
"wrong_fragment", "urgent", "hot", "num_failed_logins",
"logged_in", "num_compromised",
"root_shell", "su_attempted", "num_root",
"num_file_creations", "num_shells",
"num_access_files", "num_outbound_cmds",
"is_host_login","is_guest_login", "count", "srv_count",
"serror_rate", "srv_serror_rate", "rerror_rate",
"srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
"srv_diff_host_rate", "dst_host_count",
"dst_host_srv_count", "dst_host_same_srv_rate",
"dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate",
"dst_host_serror_rate", "dst_host_srv_serror_rate",
"dst_host_rerror_rate", "dst_host_srv_rerror_rate",
"connection_type")
将Catagorical值更改为数值。
testSet[,c(2)] <- as.integer(testSet[,c(2)])
testSet[,c(3)] <- as.integer(testSet[,c(3)])
testSet[,c(4)] <- as.integer(testSet[,c(4)])
testSet[,c(7)] <- as.integer(testSet[,c(7)])
testSet[,c(12)] <- as.integer(testSet[,c(12)])
testSet[,c(21)] <- as.integer(testSet[,c(21)])
testSet[,c(22)] <- as.integer(testSet[,c(22)])
trainSet[,c(2)] <- as.integer(trainSet[,c(2)])
trainSet[,c(3)] <- as.integer(trainSet[,c(3)])
trainSet[,c(4)] <- as.integer(trainSet[,c(4)])
trainSet[,c(7)] <- as.integer(trainSet[,c(7)])
trainSet[,c(12)] <- as.integer(trainSet[,c(12)])
trainSet[,c(21)] <- as.integer(trainSet[,c(21)])
trainSet[,c(22)] <- as.integer(trainSet[,c(22)])
随机化数据集
set.seed(60223)
rand <-runif(nrow(testSet))
testSet <- testSet[order(rand),]
set.seed(12558)
rand <-runif(nrow(trainSet))
trainSet <- trainSet[order(rand),]
规范化输入数据
normalize <- function(x) {return( abs((x - min(x))/(max(x) - min(x))))}
testSetNorm <- as.data.frame(lapply(testSet[,c(1:41)],normalize))
trainSetNorm <- as.data.frame(lapply(trainSet[,c(1:41)],normalize))
testSetNorm <-cbind(testSetNorm, testSet[,c(42)])
colnames(testSet)[42] <- "connection_type"
trainSetNorm <-cbind(trainSetNorm, trainSet[,c(42)])
colnames(trainSet)[42] <- "connection_type"
删除所有缺失的数据点
trainSetNorm <- trainSetNorm[complete.cases(trainSetNorm),]
testSetNorm <- testSetNorm[complete.cases(testSetNorm),]
require(class)
m1 <- knn( train = trainSetNorm[,c(1:41)], test = testSetNorm[,c(1:41)], cl = trainSetNorm[,c(42)], k = 703)