I am trying to use the KDD Cup 99 dataset with R, but unfortunately I am getting very bad results. Basically, the predictor is just guessing (around 50% error on the cross-validation set). There is probably a bug in my code, but I cannot find it.
The KDD Cup 99 dataset consists of about 4 million examples, split across 4 different classes of attacks plus the "normal" class. First, I divided the dataset into 5 files (one for each attack class and one for the "normal" class) and converted the non-numeric data to numeric. At the moment I am working on the "remote to local" (r2l) class. I selected some features based on the results of papers on the subject. After that, I sampled a number of "normal" instances equal to the number of r2l instances, to avoid the problem of class skew. I also replaced the labels of the different types of r2l attack with the single label "attack", so that I could train a two-class classifier. I then merged the sampled instances with the r2l instances into a new dataset. Finally, I applied 10-fold cross-validation to assess the model I built with an SVM, and I got the worst results in the history of machine learning... :(
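(The non-numeric conversion step is not included in the code below; a minimal sketch of what it amounts to, assuming the three symbolic columns of the standard KDD Cup 99 schema, would be:)

# Hypothetical sketch, not the exact code I used: encode the symbolic
# KDD Cup 99 columns as integer codes so the SVM can consume them.
symbolic.cols <- c("protocol_type", "service", "flag")
for (col in symbolic.cols) {
  r2l[[col]] <- as.integer(as.factor(r2l[[col]]))
}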
Here is my code:
library(e1071)  # provides svm(), used in the cross-validation function below

r2l <- read.table("kddcup_r2l.data",sep=",",header=T)
#u2r <- read.table("kddcup_u2r.data",sep=",",header=T)
#probe_original <- read.table("kddcup_probe.data",sep=",",header=T)
#dos <- read.table("kddcup_dos.data",sep=",",header=T)
normal <- read.table("kddcup_normal.data",sep=",",header=T)
#probe <- probe_original[sample(1:dim(probe_original)[1],10000),]
# Features selected by the three algorithms svm, lgp and mars
# for the different classes of attack
########################################################################
features.r2l.svm <- c("srv_count","service","duration","count","dst_host_count")
features.r2l.lgp <- c("is_guest_login","num_access_files","dst_bytes","num_failed_logins","logged_in")
features.r2l.mars <- c("srv_count","service","dst_host_srv_count","count","logged_in")
features.r2l.combined <- unique(c(features.r2l.svm,features.r2l.lgp,features.r2l.mars))
# Sample the training set containing the normal labels
# for each class of attack in order to have the same number
# of training data belonging to the "normal" class and the
# "attack" class
#######################################################################
normal_sample.r2l <- normal[sample(1:dim(normal)[1],dim(r2l)[1]),]
# This part was useful before the separation normal/attack because
# attack was composed of different types for each class
######################################################################
normal.r2l.Y <- matrix(normal_sample.r2l[,c("label")])
#######################################################################
# Class of attack Remote to Local (r2l)
#######################################################################
# Select the features according to the algorithms(svm,lgp and mars)
# for this particular type of attack. Combined contains the
# combination of the features selected by the 3 algorithms
#######################################################################
#features.r2l.svm <- c(features.r2l.svm,"label")
r2l_svm <- r2l[,features.r2l.svm]
r2l_lgp <- r2l[,features.r2l.lgp]
r2l_mars <- r2l[,features.r2l.mars]
r2l_combined <- r2l[,features.r2l.combined]
r2l_ALL <- r2l[,colnames(r2l) != "label"]
r2l.Y <- matrix(r2l[,c("label")])
r2l.Y[,1] = "attack"
# Merge the "normal" instances and the "r2l" instances and shuffle the result
###############################################################################
r2l_svm.tr <- rbind(normal_sample.r2l[,features.r2l.svm],r2l_svm)
r2l_svm.tr <- r2l_svm.tr[sample(1:nrow(r2l_svm.tr),replace=F),]
r2l_lgp.tr <- rbind(normal_sample.r2l[,features.r2l.lgp],r2l_lgp)
r2l_lgp.tr <- r2l_lgp.tr[sample(1:nrow(r2l_lgp.tr),replace=F),]
r2l_mars.tr <- rbind(normal_sample.r2l[,features.r2l.mars],r2l_mars)
r2l_mars.tr <- r2l_mars.tr[sample(1:nrow(r2l_mars.tr),replace=F),]
r2l_ALL.tr <- rbind(normal_sample.r2l[,colnames(normal_sample.r2l) != "label"],r2l_ALL)
r2l_ALL.tr <- r2l_ALL.tr[sample(1:nrow(r2l_ALL.tr),replace=F),]
r2l.Y.tr <- rbind(normal.r2l.Y,r2l.Y)
r2l.Y.tr <- matrix(r2l.Y.tr[sample(1:nrow(r2l.Y.tr),replace=F),])
#######################################################################
#
# 10-fold CROSS-VALIDATION to assess the models accuracy
#
#######################################################################
# CV for Remote to Local
########################
cv(r2l_svm.tr, r2l_lgp.tr, r2l_mars.tr, r2l_ALL.tr, r2l.Y.tr)
The cross-validation function:
cv <- function(svm.tr, lgp.tr, mars.tr, ALL.tr, Y.tr){
  Jcv.svm_mean <- NULL

  # Compute the size of the cross-validation folds
  # ==============================================
  index <- sample(1:dim(svm.tr)[1])
  size.CV <- floor(dim(svm.tr)[1]/10)

  Jcv.svm <- NULL

  # Start 10-fold cross-validation
  # ==============================
  for (i in 1:10) {
    # If m is the size of the training set (number of rows in svm.tr,
    # for example), take n observations for test and (m-n) for training,
    # with n << m (here n = m/10)
    # =================================================================
    i.ts <- (((i-1)*size.CV+1):(i*size.CV))
    i.tr <- setdiff(index, i.ts)

    Y.tr.tr <- as.factor(Y.tr[i.tr])
    Y.tr.ts <- as.factor(matrix(Y.tr[i.ts], ncol=1))

    svm.tr.tr <- svm.tr[i.tr,]
    svm.tr.ts <- svm.tr[i.ts,]

    # Fit the SVM model
    # =================
    model.svm <- svm(Y.tr.tr~., svm.tr.tr, type="C-classification")

    # Compute the predictions
    # =======================
    Y.hat.ts.svm <- predict(model.svm, svm.tr.ts)

    # Compute the error
    # =================
    h.svm <- matrix(Y.hat.ts.svm, ncol=1)
    Jcv.svm <- c(Jcv.svm, sum(!(h.svm == Y.tr.ts))/size.CV)

    print(table(h.svm, Y.tr.ts))
  }

  Jcv.svm_mean <- c(Jcv.svm_mean, mean(Jcv.svm))
  d <- 10
  print(paste("Jcv.svm_mean: ", round(Jcv.svm_mean, digits=d)))
}
I get very strange results. It seems that the algorithm does not really see any difference between the instances; it looks like it is doing nothing more than guessing. I also tried the attack class "probe" and obtained the same results. The paper I mentioned earlier achieves 30% accuracy on the r2l class and 60-98% on probe (depending on the polynomial degree).
Here are the predictions from one of the 10 folds of the cross-validation:
h.svm(attack)  & Y.tr.ts(attack)  -> 42 instances
h.svm(attack)  & Y.tr.ts(normal.) -> 44 instances
h.svm(normal.) & Y.tr.ts(attack)  -> 71 instances
h.svm(normal.) & Y.tr.ts(normal.) -> 68 instances
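On this fold that is an error rate of (44 + 71) / (42 + 44 + 71 + 68) = 115/225 ≈ 51%, i.e. essentially coin-flipping.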
If anyone can tell me what is wrong with my code, I would be very grateful.
Thanks in advance.
Answer (score: 3):
I can't tell if this is your problem, but that dataset has known issues: http://www.bruggerink.com/~zow/GradSchool/KDDCup99Harmful.html Sorry I can't help with the code, I don't know R :/