我是机器学习的新手,我成功建立了一个KNN分类器。现在我想实现n交叉验证,但在R中花费的时间太长了。是否有更有效的方法?
下面是我的代码(现在已经运行了30分钟......):
require(class)
set.seed(2095)
#dataset source:https://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
normalize<-function(x){
return ((x - min(x)) / (max(x)-min(x)))
}
#removed duplicate data and label information that are not "normal." as "attack"
dataset <- read.csv("data/kdd_data_10pc_cleansed_removed_dup.csv",header=FALSE,sep=",")
names <- read.csv("data/kdd_names.csv",header=FALSE,sep=";")
names(dataset) <- sapply((1:nrow(names)),function(i) toString(names[i,1]))
#extracting relevant features
dataset_extracted<-dataset[,c("duration", "src_bytes","dest_bytes","land","wrong_fragments","count", "diff_srv_rate", "dst_host_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_serror_rate", "under_attack")]
#shuffling of data randomly
rand_sorter = runif(nrow(dataset_extracted))
dataset_extracted <-dataset_extracted[order(rand_sorter),]
#normalizing of data from column 1-11 to value of 0 to 1
dataset_normalized <-as.data.frame(lapply(dataset_extracted[,c(1,2,3,4,5,6,7,8,9,10,11)], normalize))
folds <-cut(seq(1,nrow(dataset_normalized)), breaks=10, labels=FALSE)
each_k_error = NULL
for(j in 1:145586){
avg_error = NULL
for(i in 1:10){
testIndexes <- which(folds==i, arr.ind=TRUE)
testData <- dataset_normalized[testIndexes, ]
trainData <- dataset_normalized[-testIndexes,]
tempM = knn(train = trainData, test = testData, cl = dataset_extracted[-testIndexes,5], k = j)
tempTestTarget <- dataset_extracted[testIndexes,5]
tempTable = table(tempTestTarget, tempM)
#use row percentage to count error
error_per_class = diag(prop.table(tempTable,1))
avg_error <- c(avg_error, mean(error_per_class))
#avg_error = avg_error + mean(error_per_class)
}
each_k_error <- c(each_k_error, list(j, mean(avg_error)))
}