我有一个看起来像这样的数据集
ID 885038 885039 885040 885041 885042 885043 885044 Class
1267359 2 0 0 0 0 1 0 0
1295720 0 0 0 0 0 1 0 0
1295721 0 0 0 0 0 1 0 0
1295723 0 0 0 0 0 1 0 0
1295724 0 0 0 1 0 1 0 0
1295725 0 0 0 1 0 1 0 0
1295726 2 0 0 0 0 1 0 1
1295727 2 0 0 0 0 1 0 1
1295740 0 0 0 0 0 1 0 1
1295742 0 0 0 0 0 1 0 1
1295744 0 0 0 0 0 1 0 1
1295745 0 0 0 0 0 1 0 1
1295746 0 0 0 0 0 1 0 1
为了进行递归特征消除,我遵循了步骤
以下是我为此编写的R代码,但是,它没有显示任何错误,并且循环继续训练集的长度。
data <- read.csv("dummy - Copy.csv", header = TRUE)
rownames(data) <- data[,1]
data<-data[,-1]
for (k in 1:length(data)){
inTraining <- createDataPartition(data$Class, p = .70, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
## Building the model ####
svm.model <- svm(Class ~ ., data = training, cross=10,metric="ROC",type="eps-regression",kernel="linear",na.action=na.omit,probability = TRUE)
###### auc measure #######
#prediction and ROC
svm.model$index
svm.pred <- predict(svm.model, testing, probability = TRUE)
#calculating auc
c <- as.numeric(svm.pred)
c = c - 1
pred <- prediction(c, testing$Class)
perf <- performance(pred,"tpr","fpr")
plot(perf,fpr.stop=0.1)
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
#compute the weight vector
w = t(svm.model$coefs)%*%svm.model$SV
#compute ranking criteria
weight_matrix = w * w
#rank the features
w_transpose <- t(weight_matrix)
w2 <- as.matrix(w_transpose[order(w_transpose[,1], decreasing = FALSE),])
a <- as.matrix(w2[which(w2 == min(w2)),]) #to get the rows with minimum values
row.names(a) -> remove
data<- data[,setdiff(colnames(data),remove)]
print(length(data))
length <- (length(data))
cols_names <- colnames(data)
print(auc)
output <- paste(length,auc,sep=";")
write(output, file = "output.txt",append = TRUE)
write(cols_names, file = paste(length,"cols_selected", ".txt", sep=""))
}
打印输出就像
[1] 3
[1] 0.5
[1] 2
[1] 0.5
[1] 2
[1] 0.5
[1] 2
[1] 0.75
[1] 2
[1] 1
[1] 2
[1] 0.75
[1] 2
[1] 0.5
[1] 2
[1] 0.75
但是当我选择任何功能子集时,例如功能3并使用上述代码构建SVM模型(无循环),我没有获得相同的AUC值0.75。
data <- read.csv("3.csv", header = TRUE)
rownames(data) <- data[,1]
data<-data[,-1]
inTraining <- createDataPartition(data$Class, p = .70, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
## Building the model ####
svm.model <- svm(Class ~ ., data = training, cross=10,metric="ROC",type="eps-regression",kernel="linear",na.action=na.omit,probability = TRUE)
###### auc measure #######
#prediction and ROC
svm.model$index
svm.pred <- predict(svm.model, testing, probability = TRUE)
#calculating auc
c <- as.numeric(svm.pred)
c = c - 1
pred <- prediction(c, testing$Class)
perf <- performance(pred,"tpr","fpr")
plot(perf,fpr.stop=0.1)
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
print(auc)
prints output
[1] 3
[1] 0.75 (instead of 0.5)
这两个代码都是相同的(一个带有递归循环,另一个没有任何递归循环),但同一特征子集的AUC值仍然存在差异。
两个代码的3个功能(885041
,885043
和Class
)是相同的,但是它给出了不同的 AUC 值。
答案 0 :(得分:5)
我认为仅使用交叉验证是可以的。在您的代码中,您已经使用10倍CV进行测试错误。似乎不需要拆分数据集。
由于您没有提及调整参数,因此将cost
或gamma
设置为默认值。
library(tidyverse)
library(e1071)
library(caret)
library(ROCR)
library(foreach)
特征名称是数字,似乎svm()
在拟合过程后更改了名称。为此,我将首先更改列名称。
第二,可以使用caret::creadeFolds()
而不是createDataPartition()
来指定首屏。
set.seed(1)
k <- 5 # 5-fold CV
mydf3 <-
mydf %>%
rename_at(.vars = vars(-ID, -Class), .funs = function(x) str_c("X.", x, ".")) %>%
mutate(fold = createFolds(1:n(), k = k, list = FALSE)) # fold id column
# the number of features-------------------------------
x_num <-
mydf3 %>%
select(-ID, -Class, -fold) %>%
ncol()
要进行迭代,可以使用foreach()
。
cl <- parallel::makeCluster(2)
doParallel::registerDoParallel(cl, cores = 2)
parallel::clusterExport(cl, c("mydf3", "x_num"))
parallel::clusterEvalQ(cl, c(library(tidyverse), library(ROCR)))
#---------------------------------------------------------------
svm_rank <-
foreach(j = seq_len(x_num), .combine = rbind) %do% {
mod <-
foreach(cv = 1:k, .combine = bind_rows, .inorder = FALSE) %dopar% { # parallization
tr <-
mydf3 %>%
filter(fold != cv) %>% # train
select(-fold, -ID) %>%
e1071::svm( # fitting svm
Class ~ .,
data = .,
kernel = "linear",
type = "eps-regression",
probability = TRUE,
na.action = na.omit
)
# auc
te <-
mydf3 %>%
filter(fold == cv) %>%
predict(tr, newdata = ., probability = TRUE)
predob <- prediction(te, mydf3 %>% filter(fold == cv) %>% select(Class))
auc <- performance(predob, measure = "auc")@y.values[[1]]
# ranking - your formula
w <- t(tr$coefs) %*% tr$SV
if (is.null(names(w))) colnames(w) <- attr(tr$terms, "term.labels") # when only one feature left
(w * w) %>%
tbl_df() %>%
mutate(auc = auc)
}
auc <- mean(mod %>% select(auc) %>% pull()) # aggregate cv auc
w_mat <- colMeans(mod %>% select(-auc)) # aggregate cv ranking
remove <- names(which.min(w_mat)) # minimum rank
used <-
mydf3 %>%
select(-ID, -Class, -fold) %>%
names() %>%
str_c(collapse = " & ")
mydf3 <-
mydf3 %>%
select(-remove) # remove feature for next step
tibble(used = used, delete = remove, auc = auc)
}
#---------------------------------------------------
parallel::stopCluster(cl)
对于每个步骤,您都可以获取
svm_rank
#> # A tibble: 7 x 3
#> used delete auc
#> <chr> <chr> <dbl>
#> 1 X.885038. & X.885039. & X.885040. & X.885041. & X.885042… X.88503… 0.7
#> 2 X.885038. & X.885040. & X.885041. & X.885042. & X.885043… X.88504… 0.7
#> 3 X.885038. & X.885041. & X.885042. & X.885043. & X.885044. X.88504… 0.7
#> 4 X.885038. & X.885041. & X.885043. & X.885044. X.88504… 0.7
#> 5 X.885038. & X.885041. & X.885043. X.88504… 0.7
#> 6 X.885038. & X.885041. X.88503… 0.7
#> 7 X.885041. X.88504… 0.7