如何使用r进行多分类并行运行knn算法

时间:2016-11-23 17:31:17

标签: r parallel-processing knn

我有一个多分类问题,我正在尝试运行KNN算法来找到每个数据点周围的50个最近邻居。我在R中使用了FNN包,但是由于我的数据集有大约2900万行,所以需要很长时间。我想知道R中是否有一个可以并行运行KNN的软件包。您对其使用示例有什么建议吗?

1 个答案:

答案 0 :(得分:1)

You can use the following code by adapting it to KNN. If needed, I will provide you with the exact code. The following code is for SVC (support vector classification).





# Packages that provide foreach() / %do% / %dopar% and the parallel backend.
pkgs <- c("foreach", "doParallel")

# Attach each package with library() rather than require(): library() stops
# with an error when a package is missing, whereas require() only returns
# FALSE and lets the script fail later at the first %dopar% call.
invisible(lapply(pkgs, library, character.only = TRUE))

# Register the backend used by %dopar% with 4 workers.
registerDoParallel(cores = 4)

### PREPARE THE DATA ###

# TODO: replace the placeholder path below with the path to your dataset.
df1 <- read.csv("path/to/your/dataset.csv")

## do normalization if needed ##


### SPLIT DATA INTO K FOLDS ###
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(2016)

# Store a fold id (1..10) for every row in a new `fold` column.
# Row indices (not the class labels) are passed as the outcome, so the folds
# are not stratified by class. seq_len() is used instead of 1:nrow(df1) so an
# empty data frame yields an empty sequence rather than c(1, 0).
df1$fold <- caret::createFolds(seq_len(nrow(df1)), k = 10, list = FALSE)


### PARAMETER LIST ###
# Candidate cost values: 10^-1, 10^0, ..., 10^4.
cost <- 10^seq(-1, 4)

# Candidate gamma values: 2^-4, 2^-3, 2^-2, 2^-1.
gamma <- 2^seq(-4, -1)

# One row per (cost, gamma) combination; cost varies fastest.
parms <- expand.grid(list(cost = cost, gamma = gamma))

### LOOP THROUGH PARAMETER VALUES ###

# Name of the class-label column in df1. The same column is used as the model
# response and as the source of the true labels for scoring.
# TODO: replace "DEFAULT" with the name of your classification column.
target_col <- "DEFAULT"

# Formula `<target_col> ~ .`, built once from the column name.
model_formula <- reformulate(".", response = target_col)

# Columns handed to the model: everything except the fold id, which would
# otherwise be picked up as a predictor by `~ .`.
feature_cols <- setdiff(names(df1), "fold")

# Class whose predicted probability is scored (second level of the label).
# The probability column is selected by this name rather than by position,
# because e1071 orders the probability columns by the order in which classes
# appear in the training data, which can differ from fold to fold.
# NOTE(review): ROC/AUC scoring below is a two-class metric - confirm it is
# the metric you want before using this with more than two classes.
positive_class <- levels(factor(df1[[target_col]]))[2]

# Outer loop: one iteration per (cost, gamma) row of `parms`, run
# sequentially; results are stacked row-wise into `result`.
result <- foreach(i = seq_len(nrow(parms)), .combine = rbind) %do% {

  # Hyper-parameters of this iteration (named so that base::c is not shadowed).
  cost_i <- parms$cost[i]
  gamma_i <- parms$gamma[i]

  ### K-FOLD VALIDATION ###
  # Inner loop: one parallel task per fold; out-of-fold predictions from all
  # folds are stacked into `out` (row order is irrelevant for the AUC).
  out <- foreach(
    j = seq_len(max(df1$fold)),
    .combine = rbind,
    .inorder = FALSE
  ) %dopar% {

    # Train on every fold except j, evaluate on fold j.
    deve <- df1[df1$fold != j, feature_cols, drop = FALSE]
    test <- df1[df1$fold == j, feature_cols, drop = FALSE]

    mdl <- e1071::svm(
      model_formula,
      data = deve,
      type = "C-classification",
      kernel = "radial",
      cost = cost_i,
      gamma = gamma_i,
      probability = TRUE
    )

    pred <- predict(mdl, test, decision.values = TRUE, probability = TRUE)
    probs <- attr(pred, "probabilities")

    # True label and predicted probability of `positive_class` per test row.
    data.frame(y = test[[target_col]], prob = probs[, positive_class])
  }

  ### CALCULATE SVM PERFORMANCE ###
  # AUC over the pooled out-of-fold predictions for this parameter pair.
  roc <- pROC::roc(as.factor(out$y), out$prob)

  # One result row: cost, gamma and the cross-validated AUC.
  data.frame(parms[i, ], roc = roc$auc[1])
}