当我在kNN算法(R)中给出k值时,获得错误“k = 0必须至少为1”

时间:2017-05-03 23:30:54

标签: r knn

我什么时候放这行代码:

> m1 <- knn( train = trainSetNorm[,c(1:41)], test = testSetNorm[,c(1:41)], 
cl = trainSetNorm[,c(42)], k = 703)

我收到以下错误:

Error in knn(train = trainSetNorm[, c(1:41)], test = testSetNorm[, c(1:41)],: 
k = 0 must be at least 1







我使用校正的KDD Cup 99作为校正的10%数据集。使用10%并且将训练数据和校正数据用作测试数据。这是确切的代码:

setwd("C:/Users/admin/Desktop/BIGLOU")

加载数据集

testSet <- read.delim('corrected', sep = ',', header = FALSE)
colnames(testSet) <- c("duration", "protocol_type", "service", "flag", 
"src_bytes", "dst_bytes", "land", 
                    "wrong_fragment", "urgent", "hot", "num_failed_logins", 
"logged_in", "num_compromised",
                    "root_shell", "su_attempted", "num_root", 
"num_file_creations", "num_shells", 
                    "num_access_files", "num_outbound_cmds", 
"is_host_login","is_guest_login", "count", "srv_count", 
                    "serror_rate", "srv_serror_rate", "rerror_rate", 
"srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
                    "srv_diff_host_rate", "dst_host_count", 
"dst_host_srv_count", "dst_host_same_srv_rate", 
                    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", 
"dst_host_srv_diff_host_rate",
                    "dst_host_serror_rate", "dst_host_srv_serror_rate", 
"dst_host_rerror_rate", "dst_host_srv_rerror_rate",
                    "connection_type")

trainSet <- read.delim('kddcup.data_10_percent_corrected', sep = ',', header = FALSE)
colnames(trainSet) <- c("duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", 
                    "wrong_fragment", "urgent", "hot", "num_failed_logins", 
"logged_in", "num_compromised",
                    "root_shell", "su_attempted", "num_root", 
"num_file_creations", "num_shells", 
                    "num_access_files", "num_outbound_cmds", 
"is_host_login","is_guest_login", "count", "srv_count", 
                    "serror_rate", "srv_serror_rate", "rerror_rate", 
"srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
                    "srv_diff_host_rate", "dst_host_count", 
"dst_host_srv_count", "dst_host_same_srv_rate", 
                    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", 
"dst_host_srv_diff_host_rate",
                    "dst_host_serror_rate", "dst_host_srv_serror_rate", 
"dst_host_rerror_rate", "dst_host_srv_rerror_rate",
                    "connection_type")

将Catagorical值更改为数值。

testSet[,c(2)] <- as.integer(testSet[,c(2)])
testSet[,c(3)] <- as.integer(testSet[,c(3)])
testSet[,c(4)] <- as.integer(testSet[,c(4)])
testSet[,c(7)] <- as.integer(testSet[,c(7)])
testSet[,c(12)] <- as.integer(testSet[,c(12)])
testSet[,c(21)] <- as.integer(testSet[,c(21)])
testSet[,c(22)] <- as.integer(testSet[,c(22)])

trainSet[,c(2)] <- as.integer(trainSet[,c(2)])
trainSet[,c(3)] <- as.integer(trainSet[,c(3)])
trainSet[,c(4)] <- as.integer(trainSet[,c(4)])
trainSet[,c(7)] <- as.integer(trainSet[,c(7)])
trainSet[,c(12)] <- as.integer(trainSet[,c(12)])
trainSet[,c(21)] <- as.integer(trainSet[,c(21)])
trainSet[,c(22)] <- as.integer(trainSet[,c(22)])

随机化数据集

set.seed(60223)
rand <-runif(nrow(testSet))
testSet <- testSet[order(rand),]
set.seed(12558)
rand <-runif(nrow(trainSet))
trainSet <- trainSet[order(rand),]

规范化输入数据

normalize <- function(x) {return( abs((x - min(x))/(max(x) - min(x))))}
testSetNorm <- as.data.frame(lapply(testSet[,c(1:41)],normalize))
trainSetNorm <- as.data.frame(lapply(trainSet[,c(1:41)],normalize))

testSetNorm <-cbind(testSetNorm, testSet[,c(42)])
colnames(testSet)[42] <- "connection_type"
trainSetNorm <-cbind(trainSetNorm, trainSet[,c(42)])
colnames(trainSet)[42] <- "connection_type"

删除所有缺失的数据点

trainSetNorm <- trainSetNorm[complete.cases(trainSetNorm),]
testSetNorm <- testSetNorm[complete.cases(testSetNorm),]


require(class)
m1 <- knn( train = trainSetNorm[,c(1:41)], test = testSetNorm[,c(1:41)], cl = trainSetNorm[,c(42)], k = 703)

0 个答案:

没有答案