如何根据截止年份对各行的数值求和并添加新列?

时间:2018-06-06 17:51:31

标签: r

# Example input data frame (5 rows): one integer column per year, 2005-2016,
# plus a character column `Cutoff` holding each row's cutoff year.
structure(list(`2005` = c(0L, 0L, 0L, 2L, 1L), `2006` = c(0L, 
0L, 0L, 1L, 1L), `2007` = c(1L, 0L, 1L, 0L, 3L), `2008` = c(1L, 
0L, 0L, 4L, 3L), `2009` = c(1L, 0L, 0L, 2L, 3L), `2010` = c(0L, 
0L, 0L, 5L, 0L), `2011` = c(0L, 0L, 0L, 0L, 1L), `2012` = c(0L, 
0L, 0L, 4L, 1L), `2013` = c(1L, 0L, 1L, 0L, 0L), `2014` = c(0L, 
0L, 2L, 0L, 9L), `2015` = c(0L, 0L, 1L, 0L, 2L), `2016` = c(0L, 
0L, 0L, 0L, 0L), Cutoff = c("2011", "2015", "2015", "2005", "2011"
)), .Names = c("2005", "2006", "2007", "2008", "2009", "2010", 
"2011", "2012", "2013", "2014", "2015", "2016", "Cutoff"), row.names = c(NA, 
5L), class = "data.frame")

给定上面的数据框,我想在表中添加 4 列。首先是两列求和:一列是截止年份之前各年份数值之和,另一列是截止年份之后各年份数值之和。

然后再增加两列:一列统计截止年份之前的年份(列)个数,另一列统计截止年份之后的年份(列)个数。

对每一行而言,截止年份本身不计入任何一侧。

所以最终的表应该是这样的:

# Expected result: the same year columns and `Cutoff` as the input, plus four
# numeric columns: Numbers_Before, Numbers_After, Years_Before, Years_After.
structure(list(`2005` = c(0L, 0L, 0L, 2L, 1L), `2006` = c(0L, 
0L, 0L, 1L, 1L), `2007` = c(1L, 0L, 1L, 0L, 3L), `2008` = c(1L, 
0L, 0L, 4L, 3L), `2009` = c(1L, 0L, 0L, 2L, 3L), `2010` = c(0L, 
0L, 0L, 5L, 0L), `2011` = c(0L, 0L, 0L, 0L, 1L), `2012` = c(0L, 
0L, 0L, 4L, 1L), `2013` = c(1L, 0L, 1L, 0L, 0L), `2014` = c(0L, 
0L, 2L, 0L, 9L), `2015` = c(0L, 0L, 1L, 0L, 2L), `2016` = c(0L, 
0L, 0L, 0L, 0L), Cutoff = c("2011", "2015", "2015", "2005", "2011"
), Numbers_Before = c(3, 0, 4, 0, 11), Numbers_After = c(1, 0, 
0, 16, 12), Years_Before = c(6, 10, 10, 0, 6), Years_After = c(5, 
1, 1, 11, 5)), .Names = c("2005", "2006", "2007", "2008", "2009", 
"2010", "2011", "2012", "2013", "2014", "2015", "2016", "Cutoff", 
"Numbers_Before", "Numbers_After", "Years_Before", "Years_After"
), row.names = c(NA, 5L), class = "data.frame")

2 个答案:

答案 0 :(得分:1)

我发现先用 melt 把表转换成整洁(长)格式,再用一些 data.table 操作来计算截止年份之前和之后的年份个数及数值之和,会更简单。

library(data.table)

# Example input: one integer column per year (2005-2016) plus a character
# `Cutoff` column giving each row's cutoff year.
dt <- setDT(structure(list(`2005` = c(0L, 0L, 0L, 2L, 1L), `2006` = c(0L, 
  0L, 0L, 1L, 1L), `2007` = c(1L, 0L, 1L, 0L, 3L), `2008` = c(1L, 
  0L, 0L, 4L, 3L), `2009` = c(1L, 0L, 0L, 2L, 3L), `2010` = c(0L, 
  0L, 0L, 5L, 0L), `2011` = c(0L, 0L, 0L, 0L, 1L), `2012` = c(0L, 
  0L, 0L, 4L, 1L), `2013` = c(1L, 0L, 1L, 0L, 0L), `2014` = c(0L, 
  0L, 2L, 0L, 9L), `2015` = c(0L, 0L, 1L, 0L, 2L), `2016` = c(0L, 
  0L, 0L, 0L, 0L), Cutoff = c("2011", "2015", "2015", "2005", "2011"
  )), .Names = c("2005", "2006", "2007", "2008", "2009", "2010", 
  "2011", "2012", "2013", "2014", "2015", "2016", "Cutoff"), row.names = c(NA, 
  5L), class = "data.frame"))

# Integer row id. merge() sorts its result on the `by` column, so character
# ids taken from rownames() would reorder a table with 10 or more rows
# ("1", "10", "2", ...); integer ids keep the original row order.
dt[, row := .I]

# Long format: one record per (row, Year) pair.
dt2 <- melt(dt, id.vars = c("Cutoff", "row"), variable.name = "Year",
            variable.factor = FALSE)

# Compare years as numbers rather than as strings (only in the long table;
# `Cutoff` in `dt` itself is left untouched).
dt2[, c("Year", "Cutoff") := .(as.integer(Year), as.integer(Cutoff))]

# Per-record contributions: the value (or a count of 1) is credited to the
# "before" or "after" side; the cutoff year itself contributes to neither.
dt2[, Numbers_Before := ifelse(Year < Cutoff, value, 0)]
dt2[, Numbers_After := ifelse(Year > Cutoff, value, 0)]
dt2[, Years_Before := ifelse(Year < Cutoff, 1, 0)]
dt2[, Years_After := ifelse(Year > Cutoff, 1, 0)]

# Collapse back to one record per original row.
dt3 <- dt2[, .(Numbers_Before = sum(Numbers_Before),
               Numbers_After = sum(Numbers_After),
               Years_Before = sum(Years_Before),
               Years_After = sum(Years_After)), by = row]

# Attach the four summary columns to the wide table.
dt <- merge(dt, dt3, by = "row")

> dt
   row 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 Cutoff Numbers_Before Numbers_After Years_Before Years_After
1:   1    0    0    1    1    1    0    0    0    1    0    0    0   2011              3             1            6           5
2:   2    0    0    0    0    0    0    0    0    0    0    0    0   2015              0             0           10           1
3:   3    0    0    1    0    0    0    0    0    1    2    1    0   2015              4             0           10           1
4:   4    2    1    0    4    2    5    0    4    0    0    0    0   2005              0            16            0          11
5:   5    1    1    3    3    3    0    1    1    0    9    2    0   2011             11            12            6           5

编辑:下面改用更简洁的 data.table 语法和 dcast,而不再使用 ifelse:

# NOTE: assumes `dt` is the original input table (year columns + `Cutoff`);
# any other column present in `dt` would be melted as if it were a year.

# Integer row id: dcast() and merge() sort on their key columns, so character
# ids from rownames() would reorder tables with 10 or more rows.
dt[, row := .I]

# Long format: one record per (row, Year) pair.
dt2 <- melt(dt, id.vars = c("Cutoff", "row"), variable.name = "Year",
            variable.factor = FALSE)

# Drop the cutoff year itself, then sum the values and count the years on
# each side of the cutoff. `After` is FALSE for years before the cutoff and
# TRUE for years after it; years are compared numerically.
dt2 <- dt2[Year != Cutoff][
  , .(Numbers = sum(value), Years = .N),
  by = .(row, Cutoff, After = as.integer(Year) > as.integer(Cutoff))
]

# Back to wide format. The new columns are named Numbers_FALSE, Numbers_TRUE,
# Years_FALSE and Years_TRUE; fill = 0 covers rows with no year on one side.
dt2 <- dcast(dt2, row + Cutoff ~ After, value.var = c("Numbers", "Years"),
             fill = 0)

dt <- merge(dt, dt2, by = c("row", "Cutoff"))

> dt
   row Cutoff 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 Numbers_FALSE
1:   1   2011    0    0    1    1    1    0    0    0    1    0    0    0             3
2:   2   2015    0    0    0    0    0    0    0    0    0    0    0    0             0
3:   3   2015    0    0    1    0    0    0    0    0    1    2    1    0             4
4:   4   2005    2    1    0    4    2    5    0    4    0    0    0    0             0
5:   5   2011    1    1    3    3    3    0    1    1    0    9    2    0            11
   Numbers_TRUE Years_FALSE Years_TRUE
1:            1           6          5
2:            0          10          1
3:            0          10          1
4:           16           0         11
5:           12           6          5

答案 1 :(得分:1)

以下是dplyr方法:

library(dplyr)
library(tidyr)

# NOTE: `df1` is not defined in this snippet; presumably it is the example
# data frame from the question (year columns 2005-2016 plus `Cutoff`).
df1 %>%
  mutate(ID = row_number()) %>%
  # Long format: one record per (ID, year); `var` holds the year as a string.
  pivot_longer(`2005`:`2016`, names_to = "var", values_to = "value") %>%
  group_by(ID) %>%
  # Per-row totals; the cutoff year itself is counted on neither side.
  mutate(Numbers_Before = sum(ifelse(var < Cutoff, value, 0)),
         Numbers_After = sum(ifelse(var > Cutoff, value, 0)),
         Years_Before = sum(ifelse(var < Cutoff, 1, 0)),
         Years_After = sum(ifelse(var > Cutoff, 1, 0))) %>%
  # Drop the grouping so later operations are not silently done per ID.
  ungroup() %>%
  # Back to wide format: one column per year again.
  pivot_wider(names_from = var, values_from = value) %>%
  arrange(ID)

结果:

  Cutoff ID Numbers_Before Numbers_After Years_Before Years_After 2005 2006 2007 2008 2009 2010
1   2011  1              3             1            6           5    0    0    1    1    1    0
2   2015  2              0             0           10           1    0    0    0    0    0    0
3   2015  3              4             0           10           1    0    0    1    0    0    0
4   2005  4              0            16            0          11    2    1    0    4    2    5
5   2011  5             11            12            6           5    1    1    3    3    3    0
  2011 2012 2013 2014 2015 2016
1    0    0    1    0    0    0
2    0    0    0    0    0    0
3    0    0    1    2    1    0
4    0    4    0    0    0    0
5    1    1    0    9    2    0