我有两个数据框(data frame),一个是宽格式,一个是长格式:
# Long-format input: one row per (PID, scan_name) pair; no scan dates yet.
long_df <- structure(
  list(
    PID = rep(c(1001, 1002, 1003), times = c(3, 4, 1)),
    scan_name = c(
      "01_001A", "01_001B", "01_001C",
      "01_002A", "01_002B", "01_002D", "01_002E",
      "01_003B"
    )
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Wide-format input: one row per PID, with paired scan_name_<i> / scan_date_<i>
# columns for i = 1..5. Dates are POSIXct values tagged as UTC; NA marks a
# slot with no scan.
wide_df <- structure(
  list(
    PID = c(1001, 1002, 1003),
    scan_name_1 = c("01_001A", "01_002A", NA),
    scan_date_1 = structure(
      c(1206748800, 1240876800, NA),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    ),
    scan_name_2 = c("01_001B", "01_002B", "01_003B"),
    scan_date_2 = structure(
      c(1238544000, 1272672000, 1424736000),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    ),
    scan_name_3 = c("01_001C", NA, NA),
    scan_date_3 = structure(
      c(1301702400, NA, NA),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    ),
    scan_name_4 = c(NA, "01_002D", NA),
    scan_date_4 = structure(
      c(NA, 1400112000, NA),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    ),
    scan_name_5 = c(NA, "01_002E", NA),
    scan_date_5 = structure(
      c(NA, 1430438400, NA),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    )
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)
我想把 wide_df 中 scan_date_1、scan_date_2 等列的值取出来,按 PID 和 scan_name 对应地添加到 long_df 中。
我想得到的输出如下:
# Desired result: long_df with the matching scan_date attached to every
# (PID, scan_name) row.
goal_df <- structure(
  list(
    PID = rep(c(1001, 1002, 1003), times = c(3, 4, 1)),
    scan_name = c(
      "01_001A", "01_001B", "01_001C",
      "01_002A", "01_002B", "01_002D", "01_002E",
      "01_003B"
    ),
    scan_date = structure(
      c(
        1206748800, 1238544000, 1301702400,
        1240876800, 1272672000, 1400112000, 1430438400,
        1424736000
      ),
      class = c("POSIXct", "POSIXt"), tzone = "UTC"
    )
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)
这看起来很简单,但我用 merge、melt 等方法做的各种尝试都没能得到这个结果。任何帮助都将不胜感激!(第一次使用 dput,希望这是一个可复现的示例)
答案 0 :(得分:2)
这可以通过 data.table 的 melt 函数及其 patterns 参数高效地完成:
library(data.table)

# Convert wide_df to a data.table by reference (no copy is made).
setDT(wide_df)

# Melt the scan_name_* and scan_date_* column families in parallel: each
# pattern yields one value column, and `variable` indexes the matched column
# within its family. `measure.vars` is spelled out in full instead of relying
# on partial argument matching.
goal_df <- melt(
  wide_df,
  measure.vars = patterns("scan_name", "scan_date"),
  value.name = c("scan_name", "scan_date")
)

# Drop the rows that come from empty scan slots.
goal_df <- na.omit(goal_df)

# Remove the helper index column in place; the trailing [] forces printing.
goal_df[, variable := NULL][]
#> PID scan_name scan_date
#> 1: 1001 01_001A 2008-03-29
#> 2: 1002 01_002A 2009-04-28
#> 3: 1001 01_001B 2009-04-01
#> 4: 1002 01_002B 2010-05-01
#> 5: 1003 01_003B 2015-02-24
#> 6: 1001 01_001C 2011-04-02
#> 7: 1002 01_002D 2014-05-15
#> 8: 1002 01_002E 2015-05-01
注意:tidyr 的开发版本(GitHub)新增了功能类似的函数 pivot_longer 和 pivot_wider,请参见 tidyr 的 Pivoting 文档。使用 pivot_longer 可以这样做:
library(dplyr)
library(tidyr)

# Column names look like scan_name_<i> / scan_date_<i>. The first capture
# group selects the output column (".value"); the second is the slot index.
# `\\d+` (rather than a single `.`) keeps multi-digit indices such as
# scan_name_10 from being truncated to "1" and merged with slot 1.
# Note: every scan_* column is coerced to character first, so scan_date is
# returned as character, not POSIXct.
mutate_at(wide_df, .vars = vars(starts_with("scan")), as.character) %>%
  pivot_longer(
    -PID,
    names_to = c(".value", "id"),
    names_pattern = "(scan_date|scan_name)_(\\d+)",
    values_drop_na = TRUE
  ) %>%
  select(-id)
#> # A tibble: 8 x 3
#> PID scan_name scan_date
#> <dbl> <chr> <chr>
#> 1 1001 01_001A 2008-03-29
#> 2 1001 01_001B 2009-04-01
#> 3 1001 01_001C 2011-04-02
#> 4 1002 01_002A 2009-04-28
#> 5 1002 01_002B 2010-05-01
#> 6 1002 01_002D 2014-05-15
#> 7 1002 01_002E 2015-05-01
#> 8 1003 01_003B 2015-02-24
# Report the installed tidyr version (a development build when this was run).
packageVersion("tidyr")
#> ‘0.8.3.9000’
答案 1 :(得分:1)
这段代码把 wide_df 转换为长格式,其中每一行代表一个唯一的“人 × 事件”组合。
library(magrittr)

# Matches scan_<field>_<index>: group 1 is the field ("date" or "name"),
# group 2 is the numeric event index.
pattern <- "^scan_(date|name)_(\\d+)$"

wide_df %>%
  # Coerce every column to character so names and dates can share one column.
  dplyr::mutate_all(as.character) %>%
  # Stack all scan_* columns into key/value pairs, keeping PID as the id.
  tidyr::gather(key = "key", value = "value", -PID) %>%
  # Pull the event index out of the original column name ...
  dplyr::mutate(event_id = as.integer(sub(pattern, "\\2", key))) %>%
  # ... then reduce the key to just the field name.
  dplyr::mutate(key = sub(pattern, "\\1", key)) %>%
  # Spread the fields back out: one row per (PID, event_id).
  tidyr::spread(key = key, value = value) %>%
  dplyr::mutate(date = as.Date(date))
结果:
# A tibble: 15 x 4
PID event_id date name
<chr> <int> <date> <chr>
1 1001 1 2008-03-29 01_001A
2 1001 2 2009-04-01 01_001B
3 1001 3 2011-04-02 01_001C
4 1001 4 NA NA
5 1001 5 NA NA
6 1002 1 2009-04-28 01_002A
7 1002 2 2010-05-01 01_002B
8 1002 3 NA NA
9 1002 4 2014-05-15 01_002D
10 1002 5 2015-05-01 01_002E
11 1003 1 NA NA
12 1003 2 2015-02-24 01_003B
13 1003 3 NA NA
14 1003 4 NA NA
15 1003 5 NA NA