在这个问题的基础上,
我加入了时间因素,因此同一个 id 会对应多条记录:
# Example data: several timestamped observations per id. Within each id the
# day-of-week is constant, so `id` and `day` are written as run-length
# repeats (group sizes 3, 2, 4, 2, 3).
sample_df <- data.frame(
  id = rep(c(14129, 29102, 2191, 2192, 1912), times = c(3, 2, 4, 2, 3)),
  date = c(
    "2018-06-15 00:15:42", "2018-10-08 12:44:44", "2018-07-09 18:14:58",
    "2018-06-15 00:15:40", "2018-06-15 00:19:42",
    "2018-10-15 08:17:47", "2018-09-29 10:16:34", "2018-07-09 18:28:25",
    "2018-07-09 18:28:25",
    "2018-07-09 18:20:32", "2018-08-30 13:06:45",
    "2018-10-08 11:32:55", "2018-10-05 11:32:55", "2018-10-08 09:09:56"
  ),
  color = c(
    "blue", "blue", "green",
    "red", "red",
    "red", "green", "blue", "green",
    "purple", "blue",
    "blue", "red", "red"
  ),
  day = rep(
    c("monday", "wednesday", "thursday", "monday", "tuesday"),
    times = c(3, 2, 4, 2, 3)
  ),
  happy = c(1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1)
)
> sample_df
id date color day happy
1 14129 2018-06-15 00:15:42 blue monday 1
2 14129 2018-10-08 12:44:44 blue monday 0
3 14129 2018-07-09 18:14:58 green monday 0
4 29102 2018-06-15 00:15:40 red wednesday 0
5 29102 2018-06-15 00:19:42 red wednesday 1
6 2191 2018-10-15 08:17:47 red thursday 1
7 2191 2018-09-29 10:16:34 green thursday 0
8 2191 2018-07-09 18:28:25 blue thursday 1
9 2191 2018-07-09 18:28:25 green thursday 0
10 2192 2018-07-09 18:20:32 purple monday 0
11 2192 2018-08-30 13:06:45 blue monday 1
12 1912 2018-10-08 11:32:55 blue tuesday 0
13 1912 2018-10-05 11:32:55 red tuesday 0
14 1912 2018-10-08 09:09:56 red tuesday 1
将原始数据框放入此代码后:
# Parse the timestamp strings into POSIXct date-times.
sample_df[["date"]] <- as.POSIXct(sample_df[["date"]])

# Stack `color` and `day` into one `type` column (two output rows per input
# row), force `happy` to 0 on the rows that came from `color`, drop the
# helper `key` column, and sort by id.
sample_df_2 <- sample_df %>%
  gather(key = "key", value = "type", color:day) %>%
  mutate(happy = if_else(key == "color", 0, as.numeric(happy))) %>%
  select(-key) %>%
  arrange(id)
> sample_df_2
id date happy type
1 1912 2018-10-08 11:32:55 0 blue
2 1912 2018-10-05 11:32:55 0 red
3 1912 2018-10-08 09:09:56 0 red
4 1912 2018-10-08 11:32:55 0 tuesday
5 1912 2018-10-05 11:32:55 0 tuesday
6 1912 2018-10-08 09:09:56 1 tuesday
7 2191 2018-10-15 08:17:47 0 red
8 2191 2018-09-29 10:16:34 0 green
9 2191 2018-07-09 18:28:25 0 blue
10 2191 2018-07-09 18:28:25 0 green
11 2191 2018-10-15 08:17:47 1 thursday
12 2191 2018-09-29 10:16:34 0 thursday
13 2191 2018-07-09 18:28:25 1 thursday
14 2191 2018-07-09 18:28:25 0 thursday
15 2192 2018-07-09 18:20:32 0 purple
16 2192 2018-08-30 13:06:45 0 blue
17 2192 2018-07-09 18:20:32 0 monday
18 2192 2018-08-30 13:06:45 1 monday
19 14129 2018-06-15 00:15:42 0 blue
20 14129 2018-10-08 12:44:44 0 blue
21 14129 2018-07-09 18:14:58 0 green
22 14129 2018-06-15 00:15:42 1 monday
23 14129 2018-10-08 12:44:44 0 monday
24 14129 2018-07-09 18:14:58 0 monday
25 29102 2018-06-15 00:15:40 0 red
26 29102 2018-06-15 00:19:42 0 red
27 29102 2018-06-15 00:15:40 0 wednesday
28 29102 2018-06-15 00:19:42 1 wednesday
您会看到,在每个 id 内,`day` 行会随着每个 `date` 值重复出现。
理想情况下,我希望在每个 id 分组中只为 `day` 保留一行,
并且这一行对应该分组中最早的 `date`。
我尝试过先按 `date` 过滤,
再切出一行并与原始表做内连接,
但这种做法非常繁琐。
所需的输出-
day
答案 0 :(得分:0)
使用 `dplyr::row_number()`,
我们可以先把 day 行标记为 Flag = 0(颜色行标记为 1),然后在每个 (id, Flag) 分组中只保留一行 day 记录:
library(dplyr)

# Keep every colour row, but only one day-of-week row per id: the one with
# the earliest date. The pipe operator must end each line; a line that
# *starts* with %>% is a parse error because R has already finished the
# previous expression.
sample_df_2 %>%
  # Flag = 1 marks colour rows, Flag = 0 marks day-of-week rows.
  mutate(
    Flag = if_else(type %in% c("blue", "red", "green", "purple"), 1, 0)
  ) %>%
  group_by(id, Flag) %>%
  # row_number(date) ranks rows by date within the group (ties broken by
  # row order), so rank 1 is the earliest date. A bare row_number() would
  # keep whichever row happens to come first, which is not necessarily the
  # earliest one.
  filter(Flag == 1 | (Flag == 0 & row_number(date) == 1)) %>%
  ungroup() %>%
  select(-Flag)