我将数据分为训练集和测试集。我在训练集上做了自助法(bootstrap)重采样,现在需要在测试集上进行验证。该如何实现?是否应该通过混淆矩阵进行比较?如果是,能否请您指点一下具体做法?
这是访问数据集的共享链接: https://drive.google.com/open?id=11LzPjH8RQraOI0eOYJRVRwgnRGL6Bnic
library(tidyverse)
library(caret)
# Load the full data set (the path is specific to the author's machine).
mydata <- read.csv("C:/Users/User/Desktop/FYP/FYP2/data.csv")

# Separate the rows by outcome so each class is split 80/20 on its own.
mydata_ones <- mydata[which(mydata$INJ_FAT == 1), ]
mydata_zeros <- mydata[which(mydata$INJ_FAT == 0), ]
n_ones <- nrow(mydata_ones)
n_zeros <- nrow(mydata_zeros)

# Draw the training row positions; the seed makes the draw repeatable.
# The ones are sampled before the zeros, so the RNG stream is consumed in a
# fixed order.
set.seed(100)
mydata_ones_training_rows <- sample(seq_len(n_ones), size = 0.8 * n_ones)
mydata_zeros_training_rows <- sample(seq_len(n_zeros), size = 0.8 * n_zeros)

# Training data: the sampled rows of each class, ones stacked above zeros.
training_ones <- mydata_ones[mydata_ones_training_rows, ]
training_zeros <- mydata_zeros[mydata_zeros_training_rows, ]
train.data <- rbind(training_ones, training_zeros)

# Test data: every row that was not drawn for training, same stacking order.
test_ones <- mydata_ones[-mydata_ones_training_rows, ]
test_zeros <- mydata_zeros[-mydata_zeros_training_rows, ]
test.data <- rbind(test_ones, test_zeros)
library(boot)
# Expand every column of the training data into a numeric design matrix.
# NOTE(review): model.matrix() adds an "(Intercept)" column here, and the
# glm() formula used later adds another intercept -- confirm this is intended.
x <- model.matrix(~ ., data = train.data)
# Statistic function for boot::boot(): refit the logistic regression on one
# bootstrap resample and hand back its coefficients.
#
# data:    data frame containing the response INJ_FAT and the predictors.
# indices: row positions that make up the resample (supplied by boot()).
# Returns the named coefficient vector of the refitted model.
logit.bootstrap <- function(data, indices) {
  resample <- data[indices, ]
  model <- glm(INJ_FAT ~ ., family = "binomial", data = resample)
  coef(model)
}
# Fix the RNG seed so the bootstrap resamples are reproducible.
set.seed(12345)
# Run 3500 bootstrap replicates; boot() calls logit.bootstrap(data, indices)
# once per resample and collects the returned coefficient vectors.
# NOTE(review): x comes from model.matrix() and so already holds an
# "(Intercept)" column, while the glm() formula in logit.bootstrap adds its
# own intercept -- confirm whether passing train.data directly was intended.
logit.boot <- boot(data=as.data.frame(x), statistic=logit.bootstrap, R=3500)
# Auto-print the resulting boot object.
logit.boot
答案 0 :(得分:0)
传给 boot 的统计量函数只返回回归系数,因此要做任何形式的验证,都需要先得到预测概率。首先,我在下面运行 10 次自助法(bootstrap)重采样。请注意,您要么使用模型矩阵,要么使用公式加 data.frame,但不能两者同时使用;在您的代码中,这样做会得到 2 个截距项:
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"export": "next export"
},
"dependencies": {
"@zeit/next-css": "^1.0.1",
"@zeit/next-sass": "^1.0.1",
"antd": "^3.26.8",
"chartjs": "^0.3.24",
"classnames": "^2.2.6",
"draft-js": "^0.11.4",
"isomorphic-unfetch": "^3.0.0",
"moment": "^2.24.0",
"next": "^9.2.1",
"node-sass": "^4.13.1",
"react": "16.12.0",
"react-dom": "16.12.0",
"react-helmet": "^5.2.1",
"react-markdown": "^4.3.1",
"react-mde": "^8.1.0",
"react-redux": "^7.2.0",
"react-select": "^3.0.8",
"react-slick": "^0.25.2",
"react-toastify": "^5.5.0",
"redux": "^4.0.5",
"redux-devtools-extension": "^2.13.8",
"redux-thunk": "^2.3.0",
"showdown": "^1.9.1",
"slick-carousel": "^1.8.1"
},
"devDependencies": {
"eslint": "^6.8.0",
"eslint-loader": "^3.0.3",
"eslint-plugin-react": "^7.18.3",
"url-loader": "^3.0.0"
}
系数存储在 logit.boot$t 中:每次自助法重采样占 1 行,每个系数占 1 列:
library(tidyverse)
library(caret)
# Seed the RNG so the partition below is reproducible.
set.seed(100)
mydata <- read.csv("data.csv")

# Ask caret for a partition with p = 0.8; the training row indices are read
# from the "Resample1" element of the returned list.
idx <- createDataPartition(mydata$INJ_FAT, p = 0.8)
train_rows <- idx$Resample1
train.data <- mydata[train_rows, ]
test.data <- mydata[-train_rows, ]

library(boot)
# Bootstrap the coefficients on the training data only (10 replicates),
# passing the data frame itself rather than a model matrix.
set.seed(12345)
logit.boot <- boot(data = train.data, statistic = logit.bootstrap, R = 10)
对于单次自助法重采样,要获得预测概率,请执行以下操作:
# Show the first rows of the replicate matrix: one row per bootstrap
# replicate, one column per coefficient.
head(logit.boot[["t"]])
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] -4.271000 1.1001241 -1.4136104 -1.621620 -2.584495 5.374047 -2.691607
[2,] -5.048106 1.6833989 -0.2461192 -2.053468 -1.937496 5.608855 -2.415466
[3,] -8.152342 0.9078029 -1.2023567 -1.102740 -2.585418 5.462476 -2.304434
[4,] -6.254665 1.1466750 -0.5599730 -2.132731 -3.401947 4.939235 -17.332697
要对所有自助法重采样都得到预测,我们用全部重采样的系数做一次矩阵乘法:
# Turn log-odds into class labels using a 0.5 probability cut-off.
#
# pred:   numeric vector or matrix of log-odds (linear predictor values).
# levels: length-2 vector of labels; levels[2] is returned where the
#         probability is above 0.5, levels[1] otherwise.
# Returns an object shaped like pred that holds the chosen labels.
#
# plogis() is used instead of exp(pred) / (1 + exp(pred)): the hand-written
# form evaluates to Inf / Inf = NaN once pred exceeds roughly 709, which
# turned confident positive predictions into NA.
logodds_to_pred <- function(pred, levels) {
  ifelse(plogis(pred) > 0.5, levels[2], levels[1])
}
# Design matrix of the held-out data, built from the model formula.
test_design <- model.matrix(INJ_FAT ~ ., data = test.data)
# Coefficients of the first bootstrap replicate.
first_coefs <- logit.boot$t[1, ]
# Log-odds for every test row, converted to 0/1 (1 when prob > 0.5, else 0).
predictions_b1 <- logodds_to_pred(test_design %*% first_coefs, c(0, 1))
# Cross-tabulate predicted against observed classes and summarise with caret.
confusionMatrix(table(predictions_b1, test.data$INJ_FAT))
Confusion Matrix and Statistics
predictions_b1 0 1
0 544 27
1 10 91
Accuracy : 0.9449
95% CI : (0.9249, 0.9609)
No Information Rate : 0.8244
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7984
Mcnemar's Test P-Value : 0.008529
对于每次自助法重采样(即每一列),我们计算混淆矩阵并汇总结果:
# Log-odds of every test row under every bootstrap replicate: rows are test
# observations, columns are replicates (logit.boot$t is transposed so that
# each replicate's coefficient vector becomes one column).
logodds <- model.matrix(INJ_FAT ~ ., data = test.data) %*% t(logit.boot$t)
# Convert each replicate's column to 0/1 labels. The argument is spelled
# `levels` in full; the earlier `level=` only reached logodds_to_pred()
# through partial argument matching.
predictions <- apply(logodds, 2, logodds_to_pred, levels = c(0, 1))
我不太确定该如何合并多次自助法重采样的结果,但我想以上内容足以让您继续做下去。