这是一个重现我的问题的示例代码。实际上,我有一个更大的数据集,其中包含更多的变量。
我正在尝试使用 caret 包的 train() 函数来运行一些 SVM 模型。我发现使用矩阵接口(x/y)时,与公式接口相比,得到的交叉验证(CV)结果不同。似乎在矩阵接口的预处理期间,Yeo-Johnson 变换未应用于虚拟编码(dummy)变量,而在公式接口中则应用了。
此外,当我进一步尝试用矩阵接口训练出的模型进行预测时,预测无法正常工作。
我想做的是从这两种方法中得到相同的结果,并了解为什么当前存在差异!
library(kernlab)
library(caret)
# Sample data: a two-level outcome (`class`), one numeric predictor (`age`)
# and one categorical predictor (`hair`, stored as character strings).
# The outcome levels are spelled out once so the training and test factors
# share the same level set even if a split happens to miss one class.
class_levels <- c("Bad", "Good")

trainset <- data.frame(
  class = factor(
    c("Good", "Bad", "Good", "Good", "Bad", "Good",
      "Good", "Good", "Good", "Bad", "Bad", "Bad"),
    levels = class_levels
  ),
  age = c(67, 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 24),
  hair = c("Brown", "Black", "Brown", "Black", "Brown", "Black",
           "Brown", "Black", "Brown", "Black", "Brown", "Black")
)
# Outcome kept as a standalone vector; it is used as `y` and re-attached to
# the dummy-coded predictors further down the script.
trainset.class <- trainset$class

testset <- data.frame(
  class = factor(c("Good", "Bad", "Good"), levels = class_levels),
  age = c(64, 23, 50),
  hair = c("Brown", "Brown", "Black")
)
testset.class <- testset$class
# Dummy-coded version of the sample data (full rank: a k-level categorical
# predictor becomes k - 1 indicator columns).
# The encoder is fitted on the training predictors only and then applied to
# BOTH data sets, so the test columns always line up with the training
# columns instead of depending on which categories occur in the test set.
# Base cbind() is used so this step does not rely on `%>%`, which the
# packages loaded by this script do not attach.
Formula <- dummyVars(~ ., data = trainset[, -1], fullRank = TRUE)
dummy.trainset <- cbind(
  as.data.frame(predict(Formula, newdata = trainset)),
  trainset.class = trainset.class
)

# Name kept for compatibility; it is deliberately the training encoder.
Formula.test <- Formula
dummy.testset <- cbind(
  as.data.frame(predict(Formula.test, newdata = testset)),
  testset.class = testset.class
)
# Fit the radial SVM through the x/y ("matrix") interface on the dummy-coded
# data, since the model needs categorical predictors as numeric indicators.
# Predictors are every column of dummy.trainset except the outcome column;
# base subsetting is used because select() is not provided by the packages
# this script loads.
predictor_cols <- setdiff(names(dummy.trainset), "trainset.class")

set.seed(1)
svmFit.matrix <- train(
  x = dummy.trainset[, predictor_cols, drop = FALSE],
  y = trainset.class,
  method = "svmRadial",
  preProc = c("center", "scale", "YeoJohnson"),
  tuneLength = 3,
  trControl = trainControl(method = "cv", classProbs = TRUE)
)
svmFit.matrix
# Same model specification, this time through the formula interface.
# The seed is reset to the value used before the x/y fit so both fits start
# from the same RNG state.
set.seed(1)
cv_control <- trainControl(method = "cv", classProbs = TRUE)
svmFit.formula <- train(
  trainset.class ~ .,
  data = dummy.trainset,
  method = "svmRadial",
  preProc = c("center", "scale", "YeoJohnson"),
  tuneLength = 3,
  trControl = cv_control
)
svmFit.formula
### Predict class probabilities with both fitted models
# The x/y model was trained on the predictor columns only, so the outcome
# column is dropped here to hand predict() the same set of columns it saw
# during training (and no factor column).
test_predictors <- dummy.testset[
  , setdiff(names(dummy.testset), "testset.class"), drop = FALSE
]
predictedProbs.matrix <- predict(
  svmFit.matrix,
  newdata = test_predictors,
  type = "prob"
)
head(predictedProbs.matrix)

# Formula-interface model: newdata is passed as the full dummy-coded frame.
predictedProbs.formula <- predict(
  svmFit.formula,
  newdata = dummy.testset,
  type = "prob"
)
head(predictedProbs.formula)