我有以下data.frame:
>str(customerduration_data)
Classes 'tbl_df', 'tbl' and 'data.frame': 4495 obs. of 4 variables:
$ monthofgateOUT : Ord.factor w/ 4 levels "8"<"9"<"10"<"11": 1 1 1 1 1 1 1 1 1 1 ...
$ dayofgateOUT : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 4 5 1 1 1 1 1 2 2 3 ...
$ timeofgateOUT : Ord.factor w/ 20 levels "3"<"4"<"5"<"6"<..: 13 4 2 3 3 11 15 10 13 14 ...
$ durationCUST_hours: num 95.63 5.73 10.73 10.2 14.4 ...
我想使用以下命令将此数据拆分为训练和测试集:
# One-time installation of caTools, the package that provides sample.split().
install.packages("caTools")
library (caTools)
# Seed the random number generator so the random split is reproducible.
set.seed(6)
# NOTE(review): the whole data frame is passed as sample.split()'s first
# argument. The error reported below ("replacement has 4 rows, data has 4495")
# matches the frame's 4 columns, which suggests the split is computed over the
# columns rather than the rows -- confirm against the caTools documentation.
customerduration_data$spl=sample.split(customerduration_data,SplitRatio=0.7)
但是,运行上述命令后会发生以下错误:
>Error in `$<-.data.frame`(`*tmp*`, spl, value = c(TRUE, FALSE, FALSE, :
replacement has 4 rows, data has 4495
我该如何解决这个问题?
答案 0 :(得分:2)
作为替代方案,您可以使用基础R,这种方式速度更快(根据microbenchmark
的测试约快3.4倍),并且不需要额外的包:
# Build a logical vector with exactly floor(0.7 * n) TRUE values and shuffle it,
# so that 70% of the rows (rounded down) are flagged TRUE at random.
# The row count is read from df, so the code works for any number of rows.
n_rows <- nrow(df)
n_true <- floor(0.7 * n_rows)
df$spl <- sample(rep(c(TRUE, FALSE), times = c(n_true, n_rows - n_true)))
将其拆分为数据集:
# Flag floor(0.7 * n) randomly chosen rows as TRUE; the remaining rows are
# FALSE. The row count is read from df, so the code works for any size.
n_rows <- nrow(df)
n_true <- floor(0.7 * n_rows)
df$spl <- sample(rep(c(TRUE, FALSE), times = c(n_true, n_rows - n_true)))
# TRUE marks the 70% training share and FALSE the 30% held out for testing
# (the same convention caTools::sample.split() uses).
# df$spl extracts a plain logical vector; df[, "spl"] would stay a one-column
# table when df is a tibble, and %in% would then not compare row by row.
train_data <- df[df$spl %in% TRUE, ]
test_data <- df[df$spl %in% FALSE, ]
答案 1 :(得分:1)
函数sample.split
需要一个向量(每个观测对应一个元素),而不是整个data.frame。下面是一种简单的做法:
# sample.split() is given a vector with one element per row (the row indices)
# instead of the data frame itself; the result is stored as column `spl`.
library(caTools)
customerduration_data[["spl"]] <- sample.split(
  Y = seq_len(nrow(customerduration_data)),
  SplitRatio = 0.7
)
答案 2 :(得分:1)
您的代码是在原始data.frame中创建一个指示列。如果您想直接将数据拆分成train
和test
两个数据集,则可以执行以下操作。
library(caTools)
set.seed(6) # fixed seed, so the same rows are selected on every run
# Logical mask with one entry per row, produced with a split ratio of 0.7.
inx <- sample.split(
  Y = seq_len(nrow(customerduration_data)),
  SplitRatio = 0.7
)
# Rows where the mask is TRUE go to `train`, all remaining rows to `test`.
train <- customerduration_data[inx, ]
test <- customerduration_data[!inx, ]
这样不会创建列spl
。如果需要该列,请使用@RalfStubner的答案。
编辑。
另一种方法是将sample
与概率向量一起使用。
# Draw one TRUE/FALSE flag per row independently, with P(TRUE) = 0.7.
# Because each row is drawn independently, the share of TRUE values is only
# approximately 70%, not an exact 70/30 split.
# The number of draws follows the data, so it stays correct for any row count.
inx2 <- sample(
  c(FALSE, TRUE),
  size = nrow(customerduration_data),
  replace = TRUE,
  prob = c(0.3, 0.7)
)
对目前为止的三种解决方案进行基准测试,我得到以下结果。
# Time the three ways of generating the split indicator. Each named argument
# is one expression to benchmark; the name is the label shown in the `expr`
# column of the output below.
microbenchmark::microbenchmark(
# base R: shuffle a vector holding an exact count of TRUE and FALSE values
base_griffinevo = sample(c(rep(TRUE, floor(0.7*4495)), rep(FALSE, 4495-floor(0.7*4495))), replace = F),
# base R: independent draws with probabilities 0.3 (FALSE) and 0.7 (TRUE)
base_Rui = sample(c(FALSE, TRUE), 4495, replace = TRUE, prob = c(0.3, 0.7)),
# caTools: sample.split() on the row indices with a split ratio of 0.7
caTools_Ralf = sample.split(seq_len(nrow(customerduration_data)), 0.7)
)
#Unit: microseconds
# expr min lq mean median uq max neval
# base_griffinevo 177.072 183.7665 219.3547 195.147 239.6660 523.851 100
# base_Rui 89.708 93.2225 119.4083 119.666 134.5615 253.389 100
# caTools_Ralf 838.495 861.4235 1103.0870 926.361 1313.1390 3634.478 100
因此,更简单的基本R方式也是最快的。
答案 3 :(得分:0)
以下是使用caret
包及其createDataPartition()
函数的替代方法。我们将使用Applied Predictive Modeling包中的阿尔茨海默病数据来说明测试和训练数据集的创建。
library(AppliedPredictiveModeling)
library(caret)
# Make `diagnosis` and `predictors` available; both are used on the next line.
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
# total number of rows before splitting
nrow(adData)
# Row indices for the training share (p = 0.75); list = FALSE requests a
# non-list result that can be used directly for row subsetting.
trainIndex <- createDataPartition(diagnosis, p = 0.75, list = FALSE)
training <- adData[trainIndex, ]
testing <- adData[-trainIndex, ]
# number of rows in the training set
nrow(training)
# number of rows in the testing set
nrow(testing)
...和输出:
> library(AppliedPredictiveModeling)
> library(caret)
Loading required package: lattice
Loading required package: ggplot2
> data(AlzheimerDisease)
> adData <- data.frame(diagnosis, predictors)
> # count rows in data frame
> nrow(adData)
[1] 333
> trainIndex <- createDataPartition(diagnosis, p = .75,list=FALSE)
> training <- adData[trainIndex,]
> testing <- adData[-trainIndex,]
> # rows in training data frame
> nrow(training)
[1] 251
> # rows in testing data frame
> nrow(testing)
[1] 82
>