让我在这里直接研究一个可重复的示例:
以下是每个团队要满足的“拥有”条件的数据框:
structure(list(conferenceId = c("A10", "AAC", "ACC", "AE", "AS",
"BIG10", "BIG12", "BIGEAST", "BIGSKY", "BIGSOUTH", "BIGWEST",
"COLONIAL", "CUSA", "HORIZON", "IVY", "MAAC", "MAC", "MEAC",
"MVC", "MWC", "NE", "OVC", "PAC12", "PATRIOT", "SEC", "SOUTHERN",
"SOUTHLAND", "SUMMIT", "SUNBELT", "SWAC", "WAC", "WCC"), values = c(25.5,
33.625, 57.65, 16, 20.9, 48.55, 63.9, 45, 17.95, 28, 11, 24.4,
23.45, 10.5, 16, 12.275, 31.5, 10.95, 21.425, 36.8999999999999,
31.025, 18.1, 23.7, 19.675, 52.9999999999997, 24.5, 15, 27.5,
12.6, 17.75, 13, 33)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -32L))
> head(poss_quantiles)
# A tibble: 6 x 2
conferenceId values
<chr> <dbl>
1 A10 25.5
2 AAC 33.6
3 ACC 57.6
4 AE 16
5 AS 20.9
6 BIG10 48.5
我的主要数据帧如下:
> head(stats_df)
# A tibble: 6 x 8
season teamId teamName teamMarket conferenceName conferenceId possessions games
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int>
1 1819 AFA Falcons Air Force Mountain West MWC 75 2
2 1819 AKR Zips Akron Mid-American MAC 46 3
3 1819 ALA Crimson Tide Alabama Southeastern SEC 90.5 6
4 1819 ARK Razorbacks Arkansas Southeastern SEC 71.5 5
5 1819 ARK Razorbacks Arkansas Southeastern SEC 42.5 5
6 1819 ASU Sun Devils Arizona State Pacific 12 PAC12 91.5 7e: 6 x 8
> dim(stats_df)
[1] 6426 500
我需要过滤主数据帧stats_df
,以使每个会议的资产都大于poss_quantiles数据帧中各自的资产值。我正在努力找出使用dplyr的最佳方法。
感谢您的帮助!
答案 0 :(得分:1)
我相信以下是这个问题的要求。
我已经组成了一个数据集来测试代码。发表在最后。
library(dplyr)
stats_df %>%
inner_join(poss_quantiles) %>%
filter(possessions > values) %>%
select(-values) %>%
left_join(stats_df)
# conferenceId possessions otherCol oneMoreCol
#1 s 119.63695 -1.2519859 1.3853352
#2 d 82.68660 -0.4968500 0.1954866
#3 b 103.58936 -1.0149620 0.9405918
#4 o 139.69607 -0.1623095 0.4832004
#5 q 76.06736 0.5630558 0.1319336
#6 x 86.19777 -0.7733534 2.3939706
#7 p 135.80127 -1.1578085 0.2037951
#8 t 136.05944 1.7770844 0.5145781
数据创建代码。
set.seed(1234)
poss_quantiles <- data.frame(conferenceId = letters[sample(26, 20)],
values = runif(20, 50, 100),
stringsAsFactors = FALSE)
stats_df <- data.frame(conferenceId = letters[sample(26, 20)],
possessions = runif(20, 10, 150),
otherCol = rnorm(20),
oneMoreCol = rexp(20),
stringsAsFactors = FALSE)