R-根据其他列中的组元素数量创建一列

时间:2019-04-07 16:38:10

标签: r dataframe group-by dplyr data.table

假设我有一个名为 raw 的数据框,其结构(已简化)如下:

# Example input: one row per year of each period. Rows 1-3 belong to id "A"
# (2004-2006), rows 4-5 to "B" (2004-2005) and rows 6-8 to "C" (2010-2012).
raw <- data.frame(
  year.start = rep(c(2004, 2010), times = c(5, 3)),
  year.end = rep(c(2006, 2005, 2012), times = c(3, 2, 3)),
  id = rep(c("A", "B", "C"), times = c(3, 2, 3))
)

它包含两个年份列,表示从 year.start 到 year.end 的时间段。此外,每个时间段都属于某个产品组 id。我的目标是新建一列,在每个组内把时间段拆分为逐个年份。因此目标结果应如下所示:

# Desired result: the input columns plus year.goal, which enumerates the years
# of each id's period (A: 2004-2006, B: 2004-2005, C: 2010-2012).
goal <- data.frame(
  year.start = rep(c(2004, 2010), times = c(5, 3)),
  year.end = rep(c(2006, 2005, 2012), times = c(3, 2, 3)),
  id = rep(c("A", "B", "C"), times = c(3, 2, 3)),
  # as.numeric() keeps the column a double, matching the literal years above.
  year.goal = as.numeric(c(2004:2006, 2004:2005, 2010:2012))
)

有什么流畅的方法可以做到这一点吗?我真的没有头绪...预先感谢!

1 个答案:

答案 0 :(得分:2)

使用dplyr的解决方案。

library(dplyr)

# Group the rows by id, then give each group the run of consecutive years from
# its smallest year.start to its largest year.end (one year per row).
rows_by_id <- group_by(raw, id)
rows_with_goal <- mutate(
  rows_by_id,
  year.goal = min(year.start):max(year.end)
)

# Drop the grouping again so later verbs operate on the whole table.
raw2 <- ungroup(rows_with_goal)
raw2
# # A tibble: 8 x 4
#   year.start year.end id    year.goal
#        <dbl>    <dbl> <fct>     <int>
# 1       2004     2006 A          2004
# 2       2004     2006 A          2005
# 3       2004     2006 A          2006
# 4       2004     2005 B          2004
# 5       2004     2005 B          2005
# 6       2010     2012 C          2010
# 7       2010     2012 C          2011
# 8       2010     2012 C          2012

使用data.table的解决方案。

library(data.table)
# Work on a data.table copy so that `raw` stays an unmodified data.frame:
# setDT() converts its argument in place and `:=` adds columns by reference,
# which would otherwise leak year.goal into `raw` for every later snippet.
raw2 <- as.data.table(raw)

# Per id, fill year.goal with the consecutive years from the smallest
# year.start to the largest year.end (one value per row of the group).
raw2[, year.goal := min(year.start):max(year.end), by = id]
raw2[]
#    year.start year.end id year.goal
# 1:       2004     2006  A      2004
# 2:       2004     2006  A      2005
# 3:       2004     2006  A      2006
# 4:       2004     2005  B      2004
# 5:       2004     2005  B      2005
# 6:       2010     2012  C      2010
# 7:       2010     2012  C      2011
# 8:       2010     2012  C      2012

使用基础 R(base R)的解决方案。

# Add year.goal to the rows of a single id: consecutive years running from the
# group's first year.start to its first year.end.
add_year_goal <- function(grp) {
  first_year <- grp[["year.start"]][1]
  last_year <- grp[["year.end"]][1]
  grp$year.goal <- first_year:last_year
  grp
}

# Split into one data frame per id, extend each piece, then stack them back
# together (rbind prefixes the row names with the id, e.g. "A.1").
rows_by_id <- split(raw, f = raw[["id"]])
raw2 <- do.call(rbind, lapply(rows_by_id, add_year_goal))
raw2
#     year.start year.end id year.goal
# A.1       2004     2006  A      2004
# A.2       2004     2006  A      2005
# A.3       2004     2006  A      2006
# B.4       2004     2005  B      2004
# B.5       2004     2005  B      2005
# C.6       2010     2012  C      2010
# C.7       2010     2012  C      2011
# C.8       2010     2012  C      2012

使用tidyverse的解决方案。

library(tidyverse)

# Keep one row per distinct (year.start, year.end, id) period; as_tibble()
# makes the result print as a tibble.
raw2 <- raw %>%
  as_tibble() %>%
  distinct() %>%
  # Expand each period into its individual years, stored as a list-column ...
  mutate(year.goal = map2(year.start, year.end, `:`)) %>%
  # ... and flatten that column by name: calling unnest() without naming the
  # columns is deprecated since tidyr 1.0.0.
  unnest(year.goal)
raw2
# # A tibble: 8 x 4
#   year.start year.end id    year.goal
#        <dbl>    <dbl> <fct>     <int>
# 1       2004     2006 A          2004
# 2       2004     2006 A          2005
# 3       2004     2006 A          2006
# 4       2004     2005 B          2004
# 5       2004     2005 B          2005
# 6       2010     2012 C          2010
# 7       2010     2012 C          2011
# 8       2010     2012 C          2012

另一个dplyr解决方案。

library(dplyr)
# Within each id, start at the group's first year.start and add the zero-based
# row position, so consecutive rows receive consecutive years.
raw2 <- raw %>%
  group_by(id) %>%
  mutate(year.goal = year.start[1] + (seq_len(n()) - 1)) %>%
  ungroup()
raw2
# # A tibble: 8 x 4
#   year.start year.end id    year.goal
#        <dbl>    <dbl> <fct>     <dbl>
# 1       2004     2006 A          2004
# 2       2004     2006 A          2005
# 3       2004     2006 A          2006
# 4       2004     2005 B          2004
# 5       2004     2005 B          2005
# 6       2010     2012 C          2010
# 7       2010     2012 C          2011
# 8       2010     2012 C          2012