我使用caret :: train来训练11个模型,数据集包含32000个obs和250个变量。我在Linux机器上使用R.
该软件已经大幅放缓。我将所有模型存储在一个列表中,并写入磁盘,大约需要1.3Gb。
有没有人有使用R创建/使用RAM磁盘的经验? RAM磁盘会对性能产生重大影响吗?
我猜我正在达到交换内存限制。关于如何加速R的任何其他建议?
我想完成预测/性能命令,同时绘制测试ROC,并列出11个模型的ROC性能。另外,我想为其他几个因变量运行该过程。
我相信有80 Gb的内存,而且有7个内核。我在训练模型的同时使用了6个核心并行处理,但森林模型花了几个小时来训练。
代码类似于gripComp from this code sample,除了cor用于在训练之前删除相关性大于0.9的自变量。我没有要共享的数据集,但是大约有30000个obs和250个自变量,以及大约20个因变量,但我只是一次处理一个因变量。
伪代码。主要区别在于模型在计算时保存到磁盘,并检查磁盘以查看模型是否先前已计算过。此外,我启用了多核,并且未在此代码示例中列出。运行培训大约需要30个小时,共有6个核心。我等了约3小时的预测/性能命令,然后回家了。
caretApproach <- function(df, cName, l=3, m="cv", n=3/4,
ostats="gripComp.txt", oplot="gripComp.png",
pw=700, ph=700, plty=1, secAllowed=5,
useParellel=TRUE){
inTrain <- createDataPartition(df$INCOME, p = .8,
list = FALSE,
times = 1)
set.seed(1)
# browser()
indep <- subset(df, select = setdiff(names(df),c(cName)))
classes <- subset(df, select = c(cName))
classes <- unlist(classes) # this converts to a vector
d_cor <- as.matrix(cor(indep))
# I don't remember this code fragment, but cor is used to removed correlated columns of classes
trainDescr <- indep[inTrain,]
testDescr <- indep[-inTrain,]
trainClass <- classes[inTrain]
testClass <- classes[-inTrain]
set.seed(2)
###1 Recursive partitioning rpart rpart
rpartFit <- train(trainDescr, trainClass, method='rpart', tuneLength = l,trControl = trainControl(method = m))
###2 Recursive partitioning ctree2 party
ctreeFit <- train(trainDescr, trainClass, method='ctree2', tuneLength = l,trControl = trainControl(method = m))
###3 Random forests rf randomForest
rfFit <- train(trainDescr, trainClass, method='rf', tuneLength = l,trControl = trainControl(method = m))
###4 Random forests cforest party
cforestFit <- train(trainDescr, trainClass, method='cforest', tuneLength = l,trControl = trainControl(method = m))
###5 Bagging treebag ipred
treebagFit <- train(trainDescr, trainClass, method='treebag', tuneLength = l,trControl = trainControl(method = m))
###6 Neural networks nnet nnet
nnetFit <- train(trainDescr, trainClass, method='nnet', tuneLength = l,trControl = trainControl(method = m))
###7 Support vector machines svmRadial kernlab
svmRadialFit <- train(trainDescr, trainClass, method='svmRadial', tuneLength = l,trControl = trainControl(method = m))
###8 Support vector machines svmLinear kernlab
svmLinearFit <- train(trainDescr, trainClass, method='svmLinear', tuneLength = l,trControl = trainControl(method = m))
###9 k nearest neighbors knn caret
knnFit <- train(trainDescr, trainClass, method='knn', tuneLength = l,trControl = trainControl(method = m))
###10 Generalized linear model glm stats
glmFit <- train(trainDescr, trainClass, method='glm', tuneLength = l,trControl = trainControl(method = m))
###11 Logistic/Multinomial Regression multinom nnet
# I have an if statement here to check if trainDescr is factor
multinomFit <- train(trainDescr, trainClass, method='multinom', tuneLength = l,trControl = trainControl(method = m))
### models
models <- list(rpart=rpartFit, ctree2=ctreeFit, rf=rfFit, cforest=cforestFit, treebag=treebagFit, nnet = nnetFit, svmRadial = svmRadialFit, svmLinear = svmLinearFit, knn = knnFit, glm = glmFit, multinom=multinomFit)
# It takes about 30 hrs of clock time to get to this point
# At this point, models was written to the disk, and the file size is 1.3 GB
save(models,file='models.RData')
### predict values
predValues <- extractPrediction(models, testX = testDescr, testY = testClass)
testValues <- subset(predValues, dataType == "Test")
### predict probability
probValues <- extractProb(models, testX = testDescr, testY = testClass)
testProbs <- subset(probValues, dataType == "Test")
# I waited about 3 hours for the above 4 lines, then went home
# The rest of the code may need some correcting
############stats
###1 rpart
rpartPred <- subset(testValues, model == "rpart")
x <- confusionMatrix(rpartPred$pred, rpartPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "rpart")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x1 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
png(filename = oplot, width = pw, height = ph, units = "px")
plotColors <-colors()[c(9,17,19,24,27,33,51,62,84,254,552)]
plot(rocCurve, col=plotColors[1], main="", lty=plty)
legend('bottomright', c('rpart', 'ctree2', 'rf', 'cforest', 'treebag', 'nnet', 'svmRadial', 'svmLinear', 'knn', 'glm', 'multinom'), pch=c(15), col=plotColors)
###2 ctree2
ctreePred <- subset(testValues, model == "ctree2")
x=confusionMatrix(ctreePred$pred, ctreePred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "ctree2")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x2 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[2], main="", lty=plty, add=TRUE)
###3 rf
rfPred <- subset(testValues, model == "rf")
x=confusionMatrix(rfPred$pred, rfPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "rf")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x3 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[3], main="", lty=plty, add=TRUE)
###4 cforest
cforestPred <- subset(testValues, model == "cforest")
x=confusionMatrix(cforestPred$pred, cforestPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "cforest")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x4 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[4], main="", lty=plty, add=TRUE)
###5 treebag
treebagPred <- subset(testValues, model == "treebag")
x=confusionMatrix(treebagPred$pred, treebagPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "treebag")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x5 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[5], main="", lty=plty, add=TRUE)
###6 nnet
nnetPred <- subset(testValues, model == "nnet")
x=confusionMatrix(nnetPred$pred, nnetPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "nnet")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x6 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[6], main="", lty=plty, add=TRUE)
###7 svmRadial
svmRadialPred <- subset(testValues, model == "svmRadial")
x=confusionMatrix(svmRadialPred$pred, svmRadialPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "svmRadial")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x7 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[7], main="", lty=plty, add=TRUE)
###8 svmLinear
svmLinearPred <- subset(testValues, model == "svmLinear")
x=confusionMatrix(svmLinearPred$pred, svmLinearPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "svmLinear")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x8 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[8], main="", lty=plty, add=TRUE)
###9 knn
knnPred <- subset(testValues, model == "knn")
x=confusionMatrix(knnPred$pred, knnPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "knn")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x9 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[9], main="", lty=plty, add=TRUE)
###10 glm
glmPred <- subset(testValues, model == "glm")
x=confusionMatrix(glmPred$pred, glmPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "glm")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x10 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[10], main="", lty=plty, add=TRUE)
###11 multinom
# Needs if statement to check for factors
multinomPred <- subset(testValues, model == "multinom")
x=confusionMatrix(multinomPred$pred, multinomPred$obs)
tp <- x$table[1,1]
fn <- x$table[2,1]
fp <- x$table[1,2]
tn <- x$table[2,2]
acc <- (tp+tn)/(tp+fn+fp+tn)
sens <- tp/(tp+fn)
spec <- tn/(tn+fp)
phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
myProb <- subset(testProbs, model == "multinom")
rocrObject <- prediction(myProb$Control, myProb$obs)
rocCurve <- performance(rocrObject,"tpr","fpr")
modelAUC <- performance(rocrObject,"auc")@y.values
x11 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)
plot(rocCurve, col=plotColors[11], main="", lty=plty, add=TRUE)
dev.off()
### output
b <- c(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11)
s <- matrix(b, 11, 9, byrow=T, dimnames=list(c("rpart", "ctree2", "rf", "cforest", "treebag", "nnet", "svmRadial", "svmLinear", "knn", "glm", "multinom"), c("TP", "FN", "FP", "TN", "AUC", "Accuracy", "Sensitivity", "Specificity", "phi")))
write.table(s, ostats)
write.table(s)
return(s)