Question

我想将数据框拆分为火车和测试集。我尝试了以下方法：

library(caret)
train.index <- createDataPartition(dataframe$id, p = .81, list = FALSE)
#createDataPartition does half of what I want

我也尝试过时间片，但我想要的解决方案都不是。我需要在数据框中为训练和测试集获得相同百分比的每个id。上面的代码有效但这里出现了我的问题：我需要拆分多个连续的行，所以正常的随机拆分对我来说不起作用。我需要总是将特定数量的行作为一个测试用例，因为我想使用神经网络来识别id。

示例数据框（注意：所有..实际上都已填充）：

id   bla  blu  and  so  on
 1   ...  ...  ...  ..  ..
 1   ...  ...  ...  ..  ..
 1   ...  ...  ...  ..  3
 1   ...  ...  ...  ..  2
 1   ...  ...  ...  ..  ..
 1   ...  ...  ...  ..  ..
 1   ...  ...  ...  ..  1
 1   ...  ...  ...  ..  1
 2   ...  ...  ...  ..  ..
 2   ...  ...  ...  ..  1
 2   ...  ...  ...  ..  3
 2   ...  ...  ...  ..  ..
 3   ...  ...  ...  ..  ..
 3   ...  ...  ...  ..  33
 3   ...  ...  ...  ..  16
 3   ...  ...  ...  ..  ..
 3   ...  ...  ...  ..  ..

测试集的解决方案（其余是训练集）：

id   bla  blu  and  so  on

 1   ...  ...  ...  ..  3
 1   ...  ...  ...  ..  2
 1   ...  ...  ...  ..  1
 1   ...  ...  ...  ..  1
 2   ...  ...  ...  ..  1
 2   ...  ...  ...  ..  3
 3   ...  ...  ...  ..  33
 3   ...  ...  ...  ..  16

所以，我从我的数据集中获得了长度为2（每个）的随机块。那些是连续的行。

Answer 1

如果目标是按ID随机分区数据，可以按ID拆分数据，应用createDataPartition()函数，并将拆分数据重新组合成测试和训练数据集，如下所示。

 # split IDs and sample half to test and half to training
 # create 450 rows of random data with non-random ID assignment
 df1 <- as.data.frame(matrix(runif(1000),nrow=100,ncol=10))
 df1$id <- 1:100
 df2 <- as.data.frame(matrix(runif(3500),nrow=350,ncol=10))
 df2$id <- c(1:100,rep(1:2,125))
 theData <- rbind(df1,df2)
 theData$id <- as.factor(theData$id)
 dataList <- split(theData,theData$id)
 library(caret)
 set.seed(950146187)
 trainAndTest <- lapply(dataList,function(x){
      trainIndex <- createDataPartition(x$id,p=.5,list=FALSE)
      training <- x[trainIndex,]
      testing <- x[-trainIndex,]
      # return a list() containing both test and train data frames
      list(training=training,testing=testing)
 })
 # combine training sets to one data frame
 training <- do.call(rbind,lapply(trainAndTest,function(x){x[["training"]]}))
 # combine testing data sets to one data frame
 testing <- do.call(rbind,lapply(trainAndTest,function(x){x[["testing"]]}))
 # show approximately 50% of 450 are in each data set
 nrow(testing)
 nrow(training)

...和输出：

> nrow(testing)
[1] 224
> nrow(training)
[1] 226
>

Answer 2

这是一个解决方案，涉及@topepo对tidyverse家庭rsample的最新添加：

library(dplyr)
library(tidyr)
library(purrr)
library(rsample)

test_window <- 2

ordered_mtcars <- mtcars %>% tibble::rownames_to_column() %>% 
  # lets assume cyl represents group id, as in your example
  rename(id=cyl) %>% 
  arrange(id) %>% group_by(id) %>% 
  # we will generate sequence id to mimic the "time" aspect of data
  mutate(ordr=seq(n())) %>% ungroup()

我们将使用mtcars数据集进行少量修改来说明抽样方法

samples_df <- ordered_mtcars %>% 
  group_by(id) %>% nest() %>% 
  # we will generate a bunch of samples the size of target window. 
  # initial = 1 ensures that we have samples from every "time segment" of the data
  # skip ensures samples are non-overlapping
  mutate(idx=map(data, rolling_origin, initial=1, assess=test_window, skip=test_window-1)) %>% 
  # we are only interested in "testing" samples
  unnest(idx) %>% mutate(r_test=map(splits, testing))

  head(samples_df) 
#> # A tibble: 6 x 4
#>      id       splits    id1            r_test
#>   <dbl>       <list>  <chr>            <list>
#> 1     4 <S3: rsplit> Slice1 <tibble [2 x 12]>
#> 2     4 <S3: rsplit> Slice2 <tibble [2 x 12]>
#> 3     4 <S3: rsplit> Slice3 <tibble [2 x 12]>
#> 4     4 <S3: rsplit> Slice4 <tibble [2 x 12]>
#> 5     4 <S3: rsplit> Slice5 <tibble [2 x 12]>
#> 6     6 <S3: rsplit> Slice1 <tibble [2 x 12]>

让我们计算sample_df的哪个部分由1个采样窗口（按组）表示。我们将对这么多组进行抽样，以确保我们的小组中至少有一组采样。

frac <- samples_df %>% group_by(id) %>%
  summarise(frac=1/n()) %>% pull(frac) %>% max

  # here we are sampling bunches per group, so that we draw exactly 1 bunch from the smallest group
ordered_mtcars_test <- samples_df %>% 
  group_by(id) %>% 
  sample_frac(size = frac) %>% 
  unnest(r_test) %>% 
  arrange(id, ordr) %>% ungroup() %>% 
  select(names(ordered_mtcars))

  ordered_mtcars_test
#> # A tibble: 10 x 13
#>              rowname   mpg    id  disp    hp  drat    wt  qsec    vs    am  gear  carb  ordr
#>                <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#>  1    Toyota Corolla  33.9     4  71.1    65  4.22 1.835 19.90     1     1     4     1     6
#>  2     Toyota Corona  21.5     4 120.1    97  3.70 2.465 20.01     1     0     3     1     7
#>  3      Lotus Europa  30.4     4  95.1   113  3.77 1.513 16.90     1     1     5     2    10
#>  4        Volvo 142E  21.4     4 121.0   109  4.11 2.780 18.60     1     1     4     2    11
#>  5         Merc 280C  17.8     6 167.6   123  3.92 3.440 18.90     1     0     4     4     6
#>  6      Ferrari Dino  19.7     6 145.0   175  3.62 2.770 15.50     0     1     5     6     7
#>  7 Chrysler Imperial  14.7     8 440.0   230  3.23 5.345 17.42     0     0     3     4     8
#>  8  Dodge Challenger  15.5     8 318.0   150  2.76 3.520 16.87     0     0     3     2     9
#>  9  Pontiac Firebird  19.2     8 400.0   175  3.08 3.845 17.05     0     0     3     2    12
#> 10    Ford Pantera L  15.8     8 351.0   264  4.22 3.170 14.50     0     1     5     4    13

数据的训练部分就是未在测试集中结束的任何内容

ordered_mtcars_train <-  ordered_mtcars %>% 
  anti_join(ordered_mtcars_test, by=c("id", "ordr"))

ordered_mtcars_train
#> # A tibble: 22 x 13
#>           rowname   mpg    id  disp    hp  drat    wt  qsec    vs    am  gear  carb  ordr
#>             <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#>  1     Datsun 710  22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1     1
#>  2      Merc 240D  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2     2
#>  3       Merc 230  22.8     4 140.8    95  3.92 3.150 22.90     1     0     4     2     3
#>  4       Fiat 128  32.4     4  78.7    66  4.08 2.200 19.47     1     1     4     1     4
#>  5    Honda Civic  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2     5
#>  6      Fiat X1-9  27.3     4  79.0    66  4.08 1.935 18.90     1     1     4     1     8
#>  7  Porsche 914-2  26.0     4 120.3    91  4.43 2.140 16.70     0     1     5     2     9
#>  8      Mazda RX4  21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4     1
#>  9  Mazda RX4 Wag  21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4     2
#> 10 Hornet 4 Drive  21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1     3

将数据帧拆分为训练和测试集

2 个答案: