假设我有以下数据集
df <- read.table(text="UTCDate UTCTime White Black
2018.01.01 03:49:40 JL XN
2018.01.01 03:52:01 XN JL
2018.01.01 03:54:16 JL XN
2018.01.01 03:55:58 XN JL
2018.01.01 03:57:59 JL XN
2018.01.01 04:00:27 XN JL
2018.01.01 04:01:48 JL XN
2018.01.01 04:03:43 XN JL
2018.01.01 04:06:12 JL XN
2018.01.01 09:21:16 JL OC
2018.01.01 09:22:28 OC JL
2018.01.01 09:24:16 JL OC
2018.01.01 09:24:58 OC JL", header=T)
我正在尝试创建一个列来跟踪“连续游戏”,其中连续游戏定义为距前一个条目少于10分钟的条目。在这里,连续游戏是从某个玩家(例如JL)的角度定义的,因此consec_games列中的第1至9行将为(1:9),第10至13行将为(1:4)。
“连续游戏”列应仅适用于单个玩家(例如JL)。因此,它将是JL_consecutive_games。这是一个大型数据集,其中多个玩家可能同时在玩其他玩家。因此,“连续游戏”列仅适用于某些预先指定的玩家(在本例中为JL)。
我看到了这样的答案:Consecutive count by groups利用了滞后函数,但是我不确定如何将滞后应用于此数据集。
答案 0 :(得分:5)
仅在基数R中。
首先使用两个独立的列UTCDate
和UTCTime
制作一个数据/时间对象。然后使用cumsum
技巧来获取组。最后,ave
将函数(seq_along
)应用于d
定义的每个向量。
UTC <- paste(df$UTCDate, df$UTCTime)
UTC <- as.POSIXct(UTC, format = "%Y.%m.%d %H:%M:%S")
d <- c(0, difftime(UTC[-1], UTC[-length(UTC)], units = "mins"))
d <- cumsum(d > 10)
ave(seq_len(nrow(df)), d, FUN = seq_along)
#[1] 1 2 3 4 5 6 7 8 9 1 2 3 4
df$Games <- ave(seq_len(nrow(df)), d, FUN = seq_along)
rm(UTC, d) # Tidy up
答案 1 :(得分:2)
便捷data.table
功能怎么样:
consecutive_plays <- function(df, player, date_var = "UTCDate", time_var = "UTCTime", white_var = "White", black_var = "Black", diff_time = 10, unit_time = "mins") {
require(data.table)
setDT(df)[, `:=` (TimeDimension = as.POSIXct(paste(get(date_var), get(time_var)), format = "%Y.%m.%d %H:%M:%S"),
Player_Present = get(white_var) == player | get(black_var) == player)][
, time_diff := as.numeric(difftime(TimeDimension, shift(TimeDimension), units = unit_time) >= diff_time), by = .(Player_Present)][
is.na(time_diff), time_diff := 0][
get(white_var) == player | get(black_var) == player, paste0(player, "_consecutive_games") := seq_len(.N), by = .(Player_Present, cumsum(time_diff))][
, c("TimeDimension", "time_diff", "Player_Present") := NULL
]
}
您可以呼叫任何所需的玩家:
df <- consecutive_plays(df, player = "JL")
并获得相应列的输出:
UTCDate UTCTime White Black JL_consecutive_games
1: 2018.01.01 03:49:40 JL XN 1
2: 2018.01.01 03:52:01 XN JL 2
3: 2018.01.01 03:54:16 JL XN 3
4: 2018.01.01 03:55:58 XN JL 4
5: 2018.01.01 03:57:59 JL XN 5
6: 2018.01.01 04:00:27 XN JL 6
7: 2018.01.01 04:01:48 JL XN 7
8: 2018.01.01 04:03:43 XN JL 8
9: 2018.01.01 04:06:12 JL XN 9
10: 2018.01.01 09:21:16 JL OC 1
11: 2018.01.01 09:22:28 OC JL 2
12: 2018.01.01 09:24:16 JL OC 3
13: 2018.01.01 09:24:58 OC JL 4
如果玩家没有参加任何一场比赛,则这些行将设置为NA
:
df <- consecutive_plays(df, player = "XN")
UTCDate UTCTime White Black JL_consecutive_games XN_consecutive_games
1: 2018.01.01 03:49:40 JL XN 1 1
2: 2018.01.01 03:52:01 XN JL 2 2
3: 2018.01.01 03:54:16 JL XN 3 3
4: 2018.01.01 03:55:58 XN JL 4 4
5: 2018.01.01 03:57:59 JL XN 5 5
6: 2018.01.01 04:00:27 XN JL 6 6
7: 2018.01.01 04:01:48 JL XN 7 7
8: 2018.01.01 04:03:43 XN JL 8 8
9: 2018.01.01 04:06:12 JL XN 9 9
10: 2018.01.01 09:21:16 JL OC 1 NA
11: 2018.01.01 09:22:28 OC JL 2 NA
12: 2018.01.01 09:24:16 JL OC 3 NA
13: 2018.01.01 09:24:58 OC JL 4 NA
拥有多少玩家并不重要。您可以轻松地快速获取所有列:
players <- unique(c(as.character(df$White), as.character(df$Black)))
for (player in players) { df <- consecutive_plays(df, player = player) }
输出:
UTCDate UTCTime White Black JL_consecutive_games XN_consecutive_games OC_consecutive_games
1: 2018.01.01 03:49:40 JL XN 1 1 NA
2: 2018.01.01 03:52:01 XN JL 2 2 NA
3: 2018.01.01 03:54:16 JL XN 3 3 NA
4: 2018.01.01 03:55:58 XN JL 4 4 NA
5: 2018.01.01 03:57:59 JL XN 5 5 NA
6: 2018.01.01 04:00:27 XN JL 6 6 NA
7: 2018.01.01 04:01:48 JL XN 7 7 NA
8: 2018.01.01 04:03:43 XN JL 8 8 NA
9: 2018.01.01 04:06:12 JL XN 9 9 NA
10: 2018.01.01 09:21:16 JL OC 1 NA 1
11: 2018.01.01 09:22:28 OC JL 2 NA 2
12: 2018.01.01 09:24:16 JL OC 3 NA 3
13: 2018.01.01 09:24:58 OC JL 4 NA 4
最终,您还可以配置其他参数,例如如果您想将diff_time
更改为10分钟以上,或者要将unit_time
更改为hours
,days
或weeks
,等等
答案 2 :(得分:1)
这是使用tidyverse的解决方案,可为您提供所有玩家的结果:
library(tidyverse)
library(magrittr)
library(lubridate)
df %<>%
gather("color", "player", 3:4) %>%
unite(datetime, 1:2) %>%
arrange(datetime) %>%
mutate(name = player)
df$datetime %<>% parse_datetime("%Y.%m.%d_%H:%M:%S")
nested <- df %>%
mutate(cond = difftime(df$datetime, lag(df$datetime)) < 600) %>%
group_by(player) %>%
nest()
get_cons <- function(df) {
df$consecutive_games[1] <- 1
for(i in 2:nrow(df)) {
if(df$cond[i] == T) {
df$consecutive_games[i] <- df$consecutive_games[i - 1] + 1
}
df$cond[i] <- 1
}
df %>%
select(- cond)
}
options(tibble.print_max = Inf) # to show entire tibble
map_df(nested$data, get_cons)
# A tibble: 26 x 4
datetime color name consecutive_games
<dttm> <chr> <chr> <dbl>
1 2018-01-01 03:49:40 White JL 1
2 2018-01-01 03:52:01 Black JL 2
3 2018-01-01 03:54:16 White JL 3
4 2018-01-01 03:55:58 Black JL 4
5 2018-01-01 03:57:59 White JL 5
6 2018-01-01 04:00:27 Black JL 6
7 2018-01-01 04:01:48 White JL 7
8 2018-01-01 04:03:43 Black JL 8
9 2018-01-01 04:06:12 White JL 9
10 2018-01-01 09:21:16 White JL 1
11 2018-01-01 09:22:28 Black JL 2
12 2018-01-01 09:24:16 White JL 3
13 2018-01-01 09:24:58 Black JL 4
14 2018-01-01 03:49:40 Black XN 1
15 2018-01-01 03:52:01 White XN 2
16 2018-01-01 03:54:16 Black XN 3
17 2018-01-01 03:55:58 White XN 4
18 2018-01-01 03:57:59 Black XN 5
19 2018-01-01 04:00:27 White XN 6
20 2018-01-01 04:01:48 Black XN 7
21 2018-01-01 04:03:43 White XN 8
22 2018-01-01 04:06:12 Black XN 9
23 2018-01-01 09:21:16 Black OC 1
24 2018-01-01 09:22:28 White OC 2
25 2018-01-01 09:24:16 Black OC 3
26 2018-01-01 09:24:58 White OC 4
如果您喜欢宽格式的话:
map_df(nested$data, get_cons) %>%
rownames_to_column(var = "id") %>%
mutate_at(vars(id), funs(stringi::stri_pad_left(., width = 2, pad = "0"))) %>%
spread(name, consecutive_games)
# A tibble: 26 x 6
id datetime color JL OC XN
<chr> <dttm> <chr> <dbl> <dbl> <dbl>
1 01 2018-01-01 03:49:40 White 1 NA NA
2 02 2018-01-01 03:52:01 Black 2 NA NA
3 03 2018-01-01 03:54:16 White 3 NA NA
4 04 2018-01-01 03:55:58 Black 4 NA NA
5 05 2018-01-01 03:57:59 White 5 NA NA
6 06 2018-01-01 04:00:27 Black 6 NA NA
7 07 2018-01-01 04:01:48 White 7 NA NA
8 08 2018-01-01 04:03:43 Black 8 NA NA
9 09 2018-01-01 04:06:12 White 9 NA NA
10 10 2018-01-01 09:21:16 White 1 NA NA
11 11 2018-01-01 09:22:28 Black 2 NA NA
12 12 2018-01-01 09:24:16 White 3 NA NA
13 13 2018-01-01 09:24:58 Black 4 NA NA
14 14 2018-01-01 03:49:40 Black NA NA 1
15 15 2018-01-01 03:52:01 White NA NA 2
16 16 2018-01-01 03:54:16 Black NA NA 3
17 17 2018-01-01 03:55:58 White NA NA 4
18 18 2018-01-01 03:57:59 Black NA NA 5
19 19 2018-01-01 04:00:27 White NA NA 6
20 20 2018-01-01 04:01:48 Black NA NA 7
21 21 2018-01-01 04:03:43 White NA NA 8
22 22 2018-01-01 04:06:12 Black NA NA 9
23 23 2018-01-01 09:21:16 Black NA 1 NA
24 24 2018-01-01 09:22:28 White NA 2 NA
25 25 2018-01-01 09:24:16 Black NA 3 NA
26 26 2018-01-01 09:24:58 White NA 4 NA