Question

我使用caret :: train来训练11个模型，数据集包含32000个obs和250个变量。我在Linux机器上使用R.

该软件已经大幅放缓。我将所有模型存储在一个列表中，并写入磁盘，大约需要1.3Gb。

有没有人有使用R创建/使用RAM磁盘的经验？ RAM磁盘会对性能产生重大影响吗？

我猜我正在达到交换内存限制。关于如何加速R的任何其他建议？

我想完成预测/性能命令，同时绘制测试ROC，并列出11个模型的ROC性能。另外，我想为其他几个因变量运行该过程。

我相信有80 Gb的内存，而且有7个内核。我在训练模型的同时使用了6个核心并行处理，但森林模型花了几个小时来训练。

代码类似于gripComp from this code sample，除了cor用于在训练之前删除相关性大于0.9的自变量。我没有要共享的数据集，但是大约有30000个obs和250个自变量，以及大约20个因变量，但我只是一次处理一个因变量。

伪代码。主要区别在于模型在计算时保存到磁盘，并检查磁盘以查看模型是否先前已计算过。此外，我启用了多核，并且未在此代码示例中列出。运行培训大约需要30个小时，共有6个核心。我等了约3小时的预测/性能命令，然后回家了。

caretApproach <- function(df, cName, l=3, m="cv", n=3/4, 
                          ostats="gripComp.txt", oplot="gripComp.png", 
                          pw=700, ph=700, plty=1, secAllowed=5,
                          useParellel=TRUE){

  inTrain <- createDataPartition(df$INCOME, p = .8,
                                 list = FALSE,
                                 times = 1)
  set.seed(1)

  # browser()
  indep <- subset(df, select = setdiff(names(df),c(cName)))
  classes <- subset(df, select = c(cName))
  classes <- unlist(classes)  # this converts to a vector

  d_cor <- as.matrix(cor(indep))
  # I don't remember this code fragment, but cor is used to removed correlated columns of classes

  trainDescr <- indep[inTrain,]
  testDescr <- indep[-inTrain,]
  trainClass <- classes[inTrain]
  testClass <- classes[-inTrain]

set.seed(2)
###1 Recursive partitioning  rpart rpart
rpartFit <- train(trainDescr, trainClass, method='rpart', tuneLength = l,trControl = trainControl(method = m))
###2 Recursive partitioning  ctree2 party
ctreeFit <- train(trainDescr, trainClass, method='ctree2', tuneLength = l,trControl = trainControl(method = m))
###3 Random forests  rf  randomForest
rfFit <- train(trainDescr, trainClass, method='rf', tuneLength = l,trControl = trainControl(method = m))
###4 Random forests  cforest party
cforestFit <- train(trainDescr, trainClass, method='cforest', tuneLength = l,trControl = trainControl(method = m))
###5 Bagging  treebag  ipred
treebagFit <- train(trainDescr, trainClass, method='treebag', tuneLength = l,trControl = trainControl(method = m))
###6 Neural networks  nnet  nnet
nnetFit <- train(trainDescr, trainClass, method='nnet', tuneLength = l,trControl = trainControl(method = m))
###7 Support vector machines  svmRadial  kernlab 
svmRadialFit <- train(trainDescr, trainClass, method='svmRadial', tuneLength = l,trControl = trainControl(method = m))
###8 Support vector machines  svmLinear  kernlab
svmLinearFit <- train(trainDescr, trainClass, method='svmLinear', tuneLength = l,trControl = trainControl(method = m))
###9 k nearest neighbors  knn  caret
knnFit <- train(trainDescr, trainClass, method='knn', tuneLength = l,trControl = trainControl(method = m))
###10 Generalized linear model glm  stats
glmFit <- train(trainDescr, trainClass, method='glm', tuneLength = l,trControl = trainControl(method = m))
###11 Logistic/Multinomial Regression multinom  nnet
# I have an if statement here to check if trainDescr is factor
multinomFit <- train(trainDescr, trainClass, method='multinom', tuneLength = l,trControl = trainControl(method = m))

### models
models <- list(rpart=rpartFit, ctree2=ctreeFit, rf=rfFit, cforest=cforestFit, treebag=treebagFit, nnet = nnetFit, svmRadial = svmRadialFit, svmLinear = svmLinearFit, knn = knnFit, glm = glmFit, multinom=multinomFit)

# It takes about 30 hrs of clock time to get to this point
# At this point, models was written to the disk, and the file size is 1.3 GB
save(models,file='models.RData')

  ### predict values
  predValues <- extractPrediction(models, testX = testDescr, testY = testClass)
  testValues <- subset(predValues, dataType == "Test")

  ### predict probability
  probValues <- extractProb(models, testX = testDescr, testY = testClass)
  testProbs <- subset(probValues, dataType == "Test")

# I waited about 3 hours for the above 4 lines, then went home
# The rest of the code may need some correcting

  ############stats
  ###1 rpart
  rpartPred <- subset(testValues, model == "rpart")
  x <- confusionMatrix(rpartPred$pred, rpartPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "rpart")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x1 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  png(filename = oplot, width = pw, height = ph, units = "px")
  plotColors <-colors()[c(9,17,19,24,27,33,51,62,84,254,552)]
  plot(rocCurve, col=plotColors[1], main="", lty=plty)
  legend('bottomright', c('rpart', 'ctree2', 'rf', 'cforest', 'treebag', 'nnet', 'svmRadial', 'svmLinear', 'knn', 'glm', 'multinom'), pch=c(15), col=plotColors)

  ###2 ctree2
  ctreePred <- subset(testValues, model == "ctree2")
  x=confusionMatrix(ctreePred$pred, ctreePred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "ctree2")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x2 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[2], main="", lty=plty, add=TRUE)

  ###3 rf
  rfPred <- subset(testValues, model == "rf")
  x=confusionMatrix(rfPred$pred, rfPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "rf")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x3 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[3], main="", lty=plty, add=TRUE)

  ###4 cforest
  cforestPred <- subset(testValues, model == "cforest")
  x=confusionMatrix(cforestPred$pred, cforestPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "cforest")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x4 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[4], main="", lty=plty, add=TRUE)

  ###5 treebag
  treebagPred <- subset(testValues, model == "treebag")
  x=confusionMatrix(treebagPred$pred, treebagPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "treebag")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x5 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[5], main="", lty=plty, add=TRUE)

  ###6 nnet
  nnetPred <- subset(testValues, model == "nnet")
  x=confusionMatrix(nnetPred$pred, nnetPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "nnet")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x6 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[6], main="", lty=plty, add=TRUE)

  ###7 svmRadial
  svmRadialPred <- subset(testValues, model == "svmRadial")
  x=confusionMatrix(svmRadialPred$pred, svmRadialPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))
  myProb <- subset(testProbs, model == "svmRadial")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x7 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[7], main="", lty=plty, add=TRUE)

  ###8 svmLinear
  svmLinearPred <- subset(testValues, model == "svmLinear")
  x=confusionMatrix(svmLinearPred$pred, svmLinearPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "svmLinear")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x8 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[8], main="", lty=plty, add=TRUE)

  ###9 knn
  knnPred <- subset(testValues, model == "knn")
  x=confusionMatrix(knnPred$pred, knnPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "knn")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x9 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[9], main="", lty=plty, add=TRUE)

  ###10 glm
  glmPred <- subset(testValues, model == "glm")
  x=confusionMatrix(glmPred$pred, glmPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "glm")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x10 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[10], main="", lty=plty, add=TRUE)

  ###11 multinom
  # Needs if statement to check for factors
  multinomPred <- subset(testValues, model == "multinom")
  x=confusionMatrix(multinomPred$pred, multinomPred$obs)
  tp <- x$table[1,1]
  fn <- x$table[2,1]
  fp <- x$table[1,2]
  tn <- x$table[2,2]
  acc <- (tp+tn)/(tp+fn+fp+tn)
  sens <- tp/(tp+fn)
  spec <- tn/(tn+fp)
  phi <- (tp*tn-fp*fn)/(sqrt(tp+fn)*sqrt(tn+fp)*sqrt(tp+fp)*sqrt(tn+fn))

  myProb <- subset(testProbs, model == "multinom")
  rocrObject <- prediction(myProb$Control, myProb$obs)
  rocCurve <- performance(rocrObject,"tpr","fpr")
  modelAUC <- performance(rocrObject,"auc")@y.values
  x11 <- c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi)

  plot(rocCurve, col=plotColors[11], main="", lty=plty, add=TRUE)
  dev.off()
  ### output
  b <- c(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11)
  s <- matrix(b, 11, 9, byrow=T, dimnames=list(c("rpart", "ctree2", "rf", "cforest", "treebag", "nnet", "svmRadial", "svmLinear", "knn", "glm", "multinom"), c("TP", "FN", "FP", "TN", "AUC", "Accuracy", "Sensitivity", "Specificity", "phi")))
  write.table(s, ostats)
  write.table(s)
  return(s)

如何加速数据模型培训/评估？

0 个答案: