Bootstrapping in R

Date: 2017-07-20 06:56:25

Tags: r optimization lapply statistics-bootstrap

I am trying to bootstrap my data, but it is very slow. I am using R on a Windows laptop. My dataset has multiple rows per observation, and these rows need to stay together when I bootstrap. A variable called VacancyId indicates which rows belong to which subject. I use a variant of this method because my outcome follows a Poisson distribution. The real bottleneck in my code is the lapply call:

library(dplyr)    # for bind_rows
library(xgboost)  # for xgboost / predict

bootSimFun.can <- function(preddata, opreddata, data) {
  # sample by VacancyId because of dependencies between rows of the same vacancy
  samp <- sample(unique(data$VacancyId), replace = TRUE)
  # create bootstrapped data
  bdata <- bind_rows(lapply(samp, function(x) data[data$VacancyId == x, ]))
  # remove NA
  bdata <- na.omit(bdata)
  # create x with the appropriate variables
  x <- bdata[, c('VacancyBankId', 'VacancyFunctionId',
                 'VacancyEducationLevelId', 'VacancyProvinceId')]
  # make sure the variables are seen as categorical
  x$FunctionId <- as.factor(x$VacancyFunctionId)
  x$EducationLevel <- as.factor(x$VacancyEducationLevelId)
  x$ProvinceId <- as.factor(x$VacancyProvinceId)
  x$VacancyBankId <- as.factor(x$VacancyBankId)
  # allocate outcome
  y <- bdata$CandidatesPerWeek
  # create dummy matrix
  x.onehot <- model.matrix(~ . + 0, data = x)
  # create parameters for the xgboost
  xgb_params <- list("objective" = "count:poisson",
                     "eval_metric" = "rmse")
  # train model
  newmodel <- xgboost(data = x.onehot,
                      label = y,
                      nrounds = 10,
                      params = xgb_params)
  # make predictions
  bpred <- predict(newmodel, type = "response", newdata = preddata)
  # scale the weekly predictions to 60 days
  bpred <- bpred * (60 / 7)
  # bind the predictions with the original data
  bpredictions <- cbind(opreddata, bpred)
  # we are interested in the predictions at campaign level, so we sum up by
  # Function, Education, and Province; which vacancy banks are used is
  # specified in the make.predictions dataframe
  aggregate <- aggregate(bpred ~ VacancyFunctionId + VacancyEducationLevelId +
                           VacancyProvinceId, data = bpredictions, sum, na.rm = TRUE)
  # make sure it is in the right order
  aggregate <- aggregate[order(aggregate$VacancyFunctionId,
                               aggregate$VacancyEducationLevelId,
                               aggregate$VacancyProvinceId), ]
  # draw random numbers from a Poisson distribution with the mean (lambda)
  # equal to the predicted values from the refitted model
  rpois(length(aggregate$bpred), lambda = aggregate$bpred)
}
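For context, I call this function many times to build the bootstrap distribution, roughly along these lines (the object names preddata, opreddata and d.9weeks, and the 200 replicates, are only illustrative placeholders), which is why the cost of each call matters so much:

set.seed(123)
# one column per bootstrap replicate, assuming every replicate returns
# predictions for the same campaign combinations
boot.draws <- replicate(200, bootSimFun.can(preddata, opreddata, d.9weeks))
# e.g. 95% bootstrap intervals per campaign
boot.ci <- apply(boot.draws, 1, quantile, probs = c(0.025, 0.975))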

The lapply call in particular is really slow. Are there alternatives?

A small subset of my data:

structure(list(VacancyId = structure(c(4L, 3L, 6L, 7L, 3L, 6L, 
6L, 7L, 3L, 4L, 7L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 5L, 5L), .Label = c("57772", 
"57775", "57818", "57820", "57821", "57822", "57871"), class = "factor"), 
VacancyBankId = structure(c(2L, 1L, 2L, 2L, 3L, 1L, 3L, 3L, 
2L, 4L, 6L, 1L, 3L, 7L, 3L, 5L, 1L, 4L, 1L, 2L), .Label = c("2", 
"17", "147", "257", "991", "1565", "1609"), class = "factor"), 
VacancyFunctionId = structure(c(2L, 1L, 1L, 2L, 1L, 1L, 1L, 
2L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 2L, 2L), .Label = c("3", 
"4", "5", "11"), class = "factor"), VacancyEducationLevelId = structure(c(2L, 
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 2L, 2L, 2L), .Label = c("4", "6", "8"), class = "factor"), 
VacancyProvinceId = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("19", 
"21"), class = "factor"), CandidatesPerWeek = c(0, 2, 0, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0)), 
.Names = c("VacancyId", "VacancyBankId", "VacancyFunctionId", "VacancyEducationLevelId", "VacancyProvinceId", "CandidatesPerWeek"), 
row.names = c(3L, 9L, 10L, 19L, 20L, 26L, 27L, 33L, 37L, 38L, 56L, 57L, 58L, 69L, 70L, 72L, 73L, 122L, 125L, 128L), class = "data.frame")

The outcome is CandidatesPerWeek. VacancyBankId, VacancyFunctionId, VacancyEducationLevelId, and VacancyProvinceId are the inputs.
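For reference, a simplified sketch of how these inputs end up in the model (assuming the dput() output above has been assigned to data):

x <- data[, c('VacancyBankId', 'VacancyFunctionId',
              'VacancyEducationLevelId', 'VacancyProvinceId')]
x[] <- lapply(x, as.factor)                   # all predictors are categorical
x.onehot <- model.matrix(~ . + 0, data = x)   # one-hot design matrix for xgboost
y <- data$CandidatesPerWeek                   # Poisson-distributed outcome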

2 Answers:

Answer 0 (score: 0)

I converted the data to a data.table, converted VacancyId to a factor, sampled the factor levels and reordered by them, and got a speedup of more than 5x over your approach, both on your sample data and on a 1-million-row table built from it.

I used this code:

library(data.table)

data2 <- copy(data)
data2 <- setDT(data2)
data2$VacancyId <- factor(data2$VacancyId, sample(levels(data2$VacancyId), replace = TRUE))
setorder(data2, VacancyId)

Here is the benchmark:

data2 <- copy(data)
data2 <- setDT(data2)
data2$VacancyId <- factor(data2$VacancyId)

f1 <- function(data){
  samp <- sample(unique(data$VacancyId), replace=TRUE)
  bdata <- bind_rows(lapply(samp, function(x) data[data$VacancyId == x,]))
  bdata
}

f2 <- function(data2){
  data2$VacancyId <- factor(data2$VacancyId,sample(levels(data2$VacancyId),replace=TRUE))
  setorder(data2,VacancyId)
  data2
}

library(microbenchmark)
microbenchmark(f1(data),f2(data2),times=1000)
# Unit: microseconds
#      expr      min        lq      mean    median       uq      max neval
#  f1(data) 2193.213 2406.9770 2616.2763 2492.2700 2591.189 21471.67  1000
# f2(data2)  308.261  372.3195  452.6593  409.4805  450.889 18877.83  1000

About 5x faster. Let's check on larger data (1 million rows sampled with replacement from your example):

set.seed(1)
big_data <- data[sample(1:nrow(data),1000000,replace=TRUE),]
big_data2 <- copy(big_data)
big_data2 <- setDT(big_data2)
big_data2$VacancyId <- factor(big_data2$VacancyId)
library(microbenchmark)
microbenchmark(f1(big_data),f2(big_data2),times=50)

# Unit: milliseconds
# expr       min       lq     mean   median       uq      max neval
# f1(big_data) 525.38332 556.3378 598.5809 570.0738 592.2092 899.6736    10
# f2(big_data2)  61.43292  66.3120 124.4283 107.0262 123.4962 374.3961    10

Still about 5x faster.

Answer 1 (score: 0)

Found it!

library(data.table)

samp <- sample(unique(d.9weeks$VacancyId), replace = TRUE)
datDT <- as.data.table(d.9weeks)
setkey(datDT, "VacancyId")
# create bootstrapped data: the keyed join repeats all rows of a VacancyId
# each time it appears in samp
bdata <- datDT[J(samp), allow.cartesian = TRUE]
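For completeness, a sketch of how this could slot into bootSimFun.can in place of the sample/lapply/bind_rows step; setting the key once outside the bootstrap loop avoids repeating that work on every replicate (bootSample.dt is just an illustrative name):

library(data.table)

datDT <- as.data.table(d.9weeks)
setkey(datDT, VacancyId)                  # key once, outside the bootstrap loop

bootSample.dt <- function(datDT) {
  samp <- sample(unique(datDT$VacancyId), replace = TRUE)
  # keyed join: every row of a sampled VacancyId is repeated once per draw
  datDT[J(samp), allow.cartesian = TRUE]
}

bdata <- bootSample.dt(datDT)             # drop-in replacement for the lapply step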