我想转换这样的转码
library(tidyverse)
library(lubridate)
turndata_wide <- tibble(turnID = 1:4,
code = c("a", "b", "a", "g"),
start = c(ymd_hms("2019_05_25 00:00:05"),
ymd_hms("2019_05_25 00:00:02"),
ymd_hms("2019_05_25 00:00:10"),
ymd_hms("2019_05_25 00:00:01")),
end = c(ymd_hms("2019_05_25 00:00:08"),
ymd_hms("2019_05_25 00:00:07"),
ymd_hms("2019_05_25 00:00:15"),
ymd_hms("2019_05_25 00:00:25")))
这会导致
> turndata_wide
# A tibble: 4 x 4
turnID code start end
<int> <chr> <dttm> <dttm>
1 1 a 2019-05-25 00:00:05 2019-05-25 00:00:08
2 2 b 2019-05-25 00:00:02 2019-05-25 00:00:07
3 3 a 2019-05-25 00:00:10 2019-05-25 00:00:15
4 4 g 2019-05-25 00:00:01 2019-05-25 00:00:25
我们称之为(社会科学家)的时间码。看起来应该像
# A tibble: 25 x 4
time a b g
<dttm> <dbl> <dbl> <dbl>
1 2019-05-25 00:00:01 NA NA 1
2 2019-05-25 00:00:02 NA 1 1
3 2019-05-25 00:00:03 NA 1 1
4 2019-05-25 00:00:04 NA 1 1
5 2019-05-25 00:00:05 1 1 1
6 2019-05-25 00:00:06 1 1 1
7 2019-05-25 00:00:07 1 1 1
8 2019-05-25 00:00:08 1 NA 1
9 2019-05-25 00:00:09 NA NA 1
10 2019-05-25 00:00:10 1 NA 1
# … with 15 more rows
我已经构建了一个(行人和丑陋的)解决方案,但是可以肯定的是,还有更好的解决方案。我的(丑陋的)方法是:
## Loop over steps 1) + 2) ########################################
df_per_turn_list <- list()
for(i in 1:nrow(turndata_wide)){
data_turn_temp <- turndata_wide[i,]%>%
gather(startend, time, start, end)%>%
full_join(.,
tibble(time = seq.POSIXt(from = min(.$time),
to = max(.$time),
by = "sec"),
code = .$code[1],
turnID = .$turnID[1]))%>%
select(-startend)%>%
arrange(time)
temp_name <- paste("data_turn_", i, sep = "")
df_per_turn_list[[temp_name]] <- data_turn_temp
}
## Steps 3) + 4): Join dfs_per turn and spread codes ########
reduce(df_per_turn_list, full_join)%>%
mutate(dummy_one = 1)%>%
select(-turnID)%>%
spread(code, dummy_one)%>%
arrange(time)
答案 0 :(得分:1)
使用tidyverse
中的cSplit_e
和splitstackshape
的一种方法。我们每秒在start
和end
之间创建一个序列,每秒group_by
创建一个序列,并将其转换为逗号分隔的值,然后使用cSplit_e
将它们转换为二进制列。
library(tidyverse)
turndata_wide %>%
mutate(time = map2(start, end, seq, by = "1 sec")) %>%
unnest(cols = time) %>%
select(-start, -end) %>%
group_by(time) %>%
summarise(code = toString(code)) %>%
splitstackshape::cSplit_e("code", type = "character", drop = TRUE)
返回的输出为:
# time code_a code_b code_g
#1 2019-05-25 00:00:01 NA NA 1
#2 2019-05-25 00:00:02 NA 1 1
#3 2019-05-25 00:00:03 NA 1 1
#4 2019-05-25 00:00:04 NA 1 1
#5 2019-05-25 00:00:05 1 1 1
#6 2019-05-25 00:00:06 1 1 1
#7 2019-05-25 00:00:07 1 1 1
#8 2019-05-25 00:00:08 1 NA 1
#9 2019-05-25 00:00:09 NA NA 1
#10 2019-05-25 00:00:10 1 NA 1
#11 2019-05-25 00:00:11 1 NA 1
#12 2019-05-25 00:00:12 1 NA 1
#13 2019-05-25 00:00:13 1 NA 1
#14 2019-05-25 00:00:14 1 NA 1
#15 2019-05-25 00:00:15 1 NA 1
#16 2019-05-25 00:00:16 NA NA 1
#17 2019-05-25 00:00:17 NA NA 1
#18 2019-05-25 00:00:18 NA NA 1
#19 2019-05-25 00:00:19 NA NA 1
#20 2019-05-25 00:00:20 NA NA 1
#21 2019-05-25 00:00:21 NA NA 1
#22 2019-05-25 00:00:22 NA NA 1
#23 2019-05-25 00:00:23 NA NA 1
#24 2019-05-25 00:00:24 NA NA 1
#25 2019-05-25 00:00:25 NA NA 1