Question

我有一些类似以下的数据：

library(sweep)

data <- bike_sales

data$group <- sample(1:4, 15644, replace = TRUE)


data %>% 
  split(.$group)

我想根据这些分组/分组过滤一些数据。

如果price小于1500（即保留所有这些观察值），则过滤列表1

随机抽取列表2

如果price在3000-5000之间，则过滤列表3

如果price大于7000，则过滤列表4

一旦所有这些拆分均被过滤，请使用bind_rows将其重新组合在一起。我只是不知道如何从此处使用map函数开始。

数据：

# A tibble: 3,887 x 18
   order.date order.id order.line quantity price price.ext customer.id bikeshop.name             bikeshop.city bikeshop.state latitude longitude product.id model                          category.primary category.secondary frame   group
   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>       <dbl> <chr>                     <chr>         <chr>             <dbl>     <dbl>      <dbl> <chr>                          <chr>            <chr>              <chr>   <int>
 1 2011-01-10        3          2        1  3200      3200           6 Louisville Race Equipment Louisville    KY                 38.3     -85.8         50 Jekyll Carbon 4                Mountain         Over Mountain      Carbon      4
 2 2011-01-10        3          4        1  5330      5330           6 Louisville Race Equipment Louisville    KY                 38.3     -85.8          4 Supersix Evo Hi-Mod Dura Ace 2 Road             Elite Road         Carbon      4
 3 2011-01-10        3          5        1  1570      1570           6 Louisville Race Equipment Louisville    KY                 38.3     -85.8         34 Synapse Disc 105               Road             Endurance Road     Alumin~     4
 4 2011-01-12        7          5        1  2340      2340           9 Minneapolis Bike Shop     Minneapolis   MN                 45.0     -93.3         87 Habit 5                        Mountain         Trail              Alumin~     4
 5 2011-01-12        7          9        1  3200      3200           9 Minneapolis Bike Shop     Minneapolis   MN                 45.0     -93.3         61 Scalpel-Si 5                   Mountain         Cross Country Race Alumin~     4
 6 2011-01-12        7         10        8  1410     11280           9 Minneapolis Bike Shop     Minneapolis   MN                 45.0     -93.3         18 CAAD8 105                      Road             Elite Road         Alumin~     4
 7 2011-01-17        9          2        8  3200     25600          25 New Orleans Velocipedes   New Orleans   LA                 30.0     -90.1         13 CAAD12 Red                     Road             Elite Road         Alumin~     4
 8 2011-01-18       11          2        8  3200     25600          19 San Francisco Cruisers    San Francisco CA                 37.8    -122.          13 CAAD12 Red                     Road             Elite Road         Alumin~     4
 9 2011-01-18       11          4        1  4500      4500          19 San Francisco Cruisers    San Francisco CA                 37.8    -122.          39 Slice Hi-Mod Dura Ace D12      Road             Triathalon         Carbon      4
10 2011-01-18       11          5        1  7460      7460          19 San Francisco Cruisers    San Francisco CA                 37.8    -122.          23 Synapse Hi-Mod Disc Red        Road             Endurance Road     Carbon      4
# ... with 3,877 more rows

Answer 1

在遵循@ antoine-sac提出的阈值参数的建议之后，我建议为每个组列出参数列表。每个组都有一些元数据：下限（对于没有下限的情况为-Inf），上限（如果没有上限则为Inf）以及是否采样而不是过滤。如果您要采样，则只需进行过滤即可。

library(dplyr)
library(purrr)
library(sweep)

set.seed(1248)
data <- bike_sales
data$group <- sample(1:4, 15644, replace = TRUE)

params <- list(
  `1` = list(low = -Inf, high = 1500, samp = F),
  `2` = list(low = NULL, high = NULL, samp = T),
  `3` = list(low = 3000, high = 5000, samp = F),
  `4` = list(low = 7000, high = Inf, samp = F)
)

data_filtered <- data %>% 
  split(.$group) %>%
  map2(params, function(dat, p) {
    if (p$samp) {
      sample_n(dat, 1)
    } else {
      dat %>%
        filter(between(price, p$low, p$high))
    }
  })

这些很大，所以这里是每个的一小部分：

data_filtered %>% map(~select(., 1:6) %>% head(3))
#> $`1`
#> # A tibble: 3 x 6
#>   order.date order.id order.line quantity price price.ext
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>
#> 1 2011-01-11        5          1        1   480       480
#> 2 2011-01-12        7         10        8  1410     11280
#> 3 2011-01-12        8          1        1  1250      1250
#> 
#> $`2`
#> # A tibble: 1 x 6
#>   order.date order.id order.line quantity price price.ext
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>
#> 1 2012-07-11      522          3        1  6390      6390
#> 
#> $`3`
#> # A tibble: 3 x 6
#>   order.date order.id order.line quantity price price.ext
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>
#> 1 2011-01-11        4          1        1  4800      4800
#> 2 2011-01-18       11          2        8  3200     25600
#> 3 2011-01-18       11          6        1  3200      3200
#> 
#> $`4`
#> # A tibble: 3 x 6
#>   order.date order.id order.line quantity price price.ext
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>
#> 1 2011-01-18       11          5        1  7460      7460
#> 2 2011-01-20       12          9        1  9590      9590
#> 3 2011-01-20       12         19        1  7460      7460

如果要将它们全部重新绑定到单个数据框中，请使用map2_dfr而不是map2。

data %>% 
  split(.$group) %>%
  map2_dfr(params, function(dat, p) { ## <--- change here
    if (p$samp) {
      sample_n(dat, 1)
    } else {
      dat %>%
        filter(between(price, p$low, p$high))
    }
  }) %>%
  head(3)
#> # A tibble: 3 x 18
#>   order.date order.id order.line quantity price price.ext customer.id
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>       <dbl>
#> 1 2011-01-11        5          1        1   480       480           8
#> 2 2011-01-12        7         10        8  1410     11280           9
#> 3 2011-01-12        8          1        1  1250      1250          16
#> # … with 11 more variables: bikeshop.name <chr>, bikeshop.city <chr>,
#> #   bikeshop.state <chr>, latitude <dbl>, longitude <dbl>,
#> #   product.id <dbl>, model <chr>, category.primary <chr>,
#> #   category.secondary <chr>, frame <chr>, group <int>

Answer 2

使用data.table包（不拆分数据）

library(data.table)
setDT(data, key = "group")

fun <- function(x, grp, df) {
  if(grp == 1) df[x < 1500] else
    if(grp == 2) df[sample(nrow(df), 1)] else       # sample one row
      if(grp == 3) df[between(x, 3000, 5000)] else
        if(grp == 4) df[x > 7000]
}

data[, fun(price, .GRP, .SD), group]

Answer 3

考虑 base R，也无需使用transform，merge和subset进行拆分或映射。具体来说，合并到一个单独的数据帧以进行较低/较高范围的分配，以供以后进行过滤。但是对于特殊的第2组采样，需要使用row.names使用必需的对象 grp2_sample ：

grp2_sample <- sample(rownames(bike_sales[bike_sales$group == 2,]), 5)   # SAMPLE OF 5

sub_df <- subset(merge(transform(bike_sales, rn = row.names(bike_sales)),
                       data.frame(group = c(1,3,4),
                                  lower = c(-Inf, 3000, 7000),
                                  upper = c(1500, 5000, Inf)),
                       by ="group", all.x=TRUE),
                 (price >= lower & price <= upper) | (rn %in% grp2_sample)
          )

或者使用 dplyr 和对方mutate，left_join和filter：

library(dplyr)
...
grp2_sample <- sample(rownames(bike_sales[bike_sales$group == 2,]), 5)   # SAMPLE OF 5

sub_df2 <- bike_sales %>%
             mutate(rn = row.names(bike_sales)) %>%
             left_join(data.frame(group = c(1,3,4),
                                  lower = c(-Inf, 3000, 7000),
                                  upper = c(1500, 5000, Inf)), 
                       by="group") %>%
             filter((price >= lower & price <= upper) | (rn %in% grp2_sample))

甚至是 data.table 替代解决方案：

library(data.table)
...
grp2_sample <- sample(rownames(bike_sales[bike_sales$pick == 2,]), 5)    # SAMPLE OF 5

sub_dt <- setDT(bike_sales)[, rn := row.names(bike_sales)][
                            data.table(group = c(1,3,4),
                                       lower = c(-Inf, 3000, 7000),
                                       upper = c(1500, 5000, Inf)), 
                            on="group", 
                            `:=`(lower=i.lower, upper=i.upper)
                           ][(price >= lower & price <= upper) | (rn %in% grp2_sample),]

分割数据框并映射到列表

3 个答案: