使用dplyr通过存储在第二个数据帧中的32个条件过滤数据帧

时间:2019-06-13 17:15:38

标签: r dplyr

让我直接给出一个可重现的示例:

以下数据框包含每支球队需要满足的 possessions(回合数)条件,按联盟(conferenceId)给出阈值:

# Literal definition of the threshold table (appears to be dput() output; its
# first rows match the head(poss_quantiles) printout in this post): 32 rows,
# one per conferenceId, with the numeric cut-off for that conference in
# `values`. The class attribute marks it as a tibble. The expression is not
# assigned here; assign it to a name (e.g. poss_quantiles) before using it.
structure(list(conferenceId = c("A10", "AAC", "ACC", "AE", "AS", 
"BIG10", "BIG12", "BIGEAST", "BIGSKY", "BIGSOUTH", "BIGWEST", 
"COLONIAL", "CUSA", "HORIZON", "IVY", "MAAC", "MAC", "MEAC", 
"MVC", "MWC", "NE", "OVC", "PAC12", "PATRIOT", "SEC", "SOUTHERN", 
"SOUTHLAND", "SUMMIT", "SUNBELT", "SWAC", "WAC", "WCC"), values = c(25.5, 
33.625, 57.65, 16, 20.9, 48.55, 63.9, 45, 17.95, 28, 11, 24.4, 
23.45, 10.5, 16, 12.275, 31.5, 10.95, 21.425, 36.8999999999999, 
31.025, 18.1, 23.7, 19.675, 52.9999999999997, 24.5, 15, 27.5, 
12.6, 17.75, 13, 33)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -32L))

> head(poss_quantiles)
# A tibble: 6 x 2
  conferenceId values
  <chr>         <dbl>
1 A10            25.5
2 AAC            33.6
3 ACC            57.6
4 AE             16  
5 AS             20.9
6 BIG10          48.5

我的主要数据帧如下:

> head(stats_df)
# A tibble: 6 x 8
  season teamId teamName     teamMarket    conferenceName conferenceId possessions games
  <chr>  <chr>  <chr>        <chr>         <chr>          <chr>              <dbl> <int>
1 1819   AFA    Falcons      Air Force     Mountain West  MWC                 75       2
2 1819   AKR    Zips         Akron         Mid-American   MAC                 46       3
3 1819   ALA    Crimson Tide Alabama       Southeastern   SEC                 90.5     6
4 1819   ARK    Razorbacks   Arkansas      Southeastern   SEC                 71.5     5
5 1819   ARK    Razorbacks   Arkansas      Southeastern   SEC                 42.5     5
6 1819   ASU    Sun Devils   Arizona State Pacific 12     PAC12               91.5     7


> dim(stats_df)
[1] 6426    8

我需要过滤主数据帧 stats_df,只保留 possessions 大于 poss_quantiles 数据帧中对应联盟(conferenceId)的 values 值的行。我还没想出用 dplyr 实现这一点的最佳方法。

感谢您的帮助!

1 个答案:

答案 0 :(得分:1)

我认为下面的代码可以满足问题的要求。
我构造了一个数据集来测试代码,数据创建代码附在最后。

library(dplyr)

# Keep only the rows of stats_df whose possessions exceed the threshold
# (`values`) that poss_quantiles stores for the same conferenceId.
stats_df %>%
  # Join on an explicit key so the match does not depend on whichever column
  # names the two tables happen to share. inner_join also drops rows whose
  # conferenceId has no entry in poss_quantiles.
  inner_join(poss_quantiles, by = "conferenceId") %>%
  filter(possessions > values) %>%
  # Drop the helper column. Every original stats_df column is already present,
  # so no join back to stats_df is needed (joining back on all columns would
  # multiply any exactly duplicated rows).
  select(-values)
#  conferenceId possessions   otherCol oneMoreCol
#1            s   119.63695 -1.2519859  1.3853352
#2            d    82.68660 -0.4968500  0.1954866
#3            b   103.58936 -1.0149620  0.9405918
#4            o   139.69607 -0.1623095  0.4832004
#5            q    76.06736  0.5630558  0.1319336
#6            x    86.19777 -0.7733534  2.3939706
#7            p   135.80127 -1.1578085  0.2037951
#8            t   136.05944  1.7770844  0.5145781

数据创建代码。

# Reproducible toy data for exercising the filtering code.
# The random draws happen in a fixed order after seeding, so the statements
# below must not be reordered.
set.seed(1234)

# Threshold table: 20 distinct conference ids (lower-case letters), each with
# a cut-off drawn uniformly from [50, 100].
poss_quantiles <- data.frame(
  conferenceId = letters[sample.int(26, size = 20)],
  values = runif(n = 20, min = 50, max = 100),
  stringsAsFactors = FALSE
)

# Main table: 20 distinct conference ids with a possessions value drawn
# uniformly from [10, 150], plus two unrelated numeric columns.
stats_df <- data.frame(
  conferenceId = letters[sample.int(26, size = 20)],
  possessions = runif(n = 20, min = 10, max = 150),
  otherCol = rnorm(n = 20),
  oneMoreCol = rexp(n = 20),
  stringsAsFactors = FALSE
)