将一组的值添加到R中的另一组

时间:2018-11-23 20:41:37

标签: r datatable dplyr tidyr zoo

我有一个问题:如何把组内某一行的值加到同组的其余各行上,然后删除该行?例如:

# Example data: two Year blocks of 21 rows each. Within a block, seeds 1, 2, 3
# and 99 each cover days 1-5 in one cluster ("a" / "b"), followed by a single
# Seed == 99 row in its own cluster ("c" / "d"). The Seed/Day/value pattern is
# the same in both blocks, so it is repeated explicitly with rep().
df <- data.frame(
  Year = rep(c(1, 2), each = 21),
  Cluster = c(rep("a", 20), "c", rep("b", 20), "d"),
  Seed = rep(c(rep(c(1, 2, 3, 99), each = 5), 99), times = 2),
  Day = rep(c(rep(c(1, 2, 3, 4, 5), times = 4), 1), times = 2),
  value = rep(
    c(5, 2, 1, 2, 8, 6, 7, 9, 3, 5, 2, 1, 2, 8, 6, 55, 66, 77, 88, 99, 10),
    times = 2
  )
)

在上面的示例中,我的数据按 Year、Cluster、Seed 和 Day 分组。我需要在每个(Year、Cluster、Day)组内,把 Seed = 99 那一行的 value 加到同组其余各行上,然后删除 Seed = 99 的那一行。例如:第 16 行属于(Year = 1、Cluster = a、Day = 1、Seed = 99),它的值 55 应分别加到第 1 行(5 + 55)、第 6 行(6 + 55)和第 11 行(2 + 55)上,随后第 16 行应被删除。但是第 21 行(Cluster = c、Seed = 99)应原样保留在数据集中,因为在相同的 Year + Cluster + Day 组合中找不到其他 Seed 的行与之匹配。

我的实际数据有 100 万条记录,包含 10 个年份、80 个簇、500 天以及 10 + 1 个种子(1 到 10,外加 99),因此我在寻找一种高效的解决方案。

     Year Cluster Seed Day value
1     1       a    1   1    60
2     1       a    1   2    68
3     1       a    1   3    78
4     1       a    1   4    90
5     1       a    1   5   107
6     1       a    2   1    61
7     1       a    2   2    73
8     1       a    2   3    86
9     1       a    2   4    91
10    1       a    2   5   104
11    1       a    3   1    57
12    1       a    3   2    67
13    1       a    3   3    79
14    1       a    3   4    96
15    1       a    3   5   105
16    1       c   99   1    10
17    2       b    1   1    60
18    2       b    1   2    68
19    2       b    1   3    78
20    2       b    1   4    90
21    2       b    1   5   107
22    2       b    2   1    61
23    2       b    2   2    73
24    2       b    2   3    86
25    2       b    2   4    91
26    2       b    2   5   104
27    2       b    3   1    57
28    2       b    3   2    67
29    2       b    3   3    79
30    2       b    3   4    96
31    2       b    3   5   105
32    2       d   99   1    10

2 个答案:

答案 0 :(得分:0)

一种 data.table 方法:

library(data.table)
df <- setDT(df)[, `:=` (value = ifelse(Seed != 99, value + value[Seed == 99], value), flag = Seed == 99 & .N == 1), by = .(Year, Cluster, Day)][!(Seed == 99 & flag == FALSE),][, "flag" := NULL]

df[]

    Year Cluster Seed Day value
 1:    1       a    1   1    60
 2:    1       a    1   2    68
 3:    1       a    1   3    78
 4:    1       a    1   4    90
 5:    1       a    1   5   107
 6:    1       a    2   1    61
 7:    1       a    2   2    73
 8:    1       a    2   3    86
 9:    1       a    2   4    91
10:    1       a    2   5   104
11:    1       a    3   1    57
12:    1       a    3   2    67
13:    1       a    3   3    79
14:    1       a    3   4    96
15:    1       a    3   5   105
16:    1       c   99   1    10
17:    2       b    1   1    60
18:    2       b    1   2    68
19:    2       b    1   3    78
20:    2       b    1   4    90
21:    2       b    1   5   107
22:    2       b    2   1    61
23:    2       b    2   2    73
24:    2       b    2   3    86
25:    2       b    2   4    91
26:    2       b    2   5   104
27:    2       b    3   1    57
28:    2       b    3   2    67
29:    2       b    3   3    79
30:    2       b    3   4    96
31:    2       b    3   5   105
32:    2       d   99   1    10

输出:

(输出即上方 df[] 打印的结果。)

答案 1 :(得分:0)

这是使用tidyverse的方法。如果您要寻找具有一百万行的速度,那么data.table解决方案可能会更好。

library(tidyverse)

# Example data: two Year blocks of 21 rows each. Within a block, seeds 1, 2, 3
# and 99 each cover days 1-5 in one cluster ("a" / "b"), followed by a single
# Seed == 99 row in its own cluster ("c" / "d"). The Seed/Day/value pattern is
# the same in both blocks, so it is repeated explicitly with rep().
df <- data.frame(
  Year = rep(c(1, 2), each = 21),
  Cluster = c(rep("a", 20), "c", rep("b", 20), "d"),
  Seed = rep(c(rep(c(1, 2, 3, 99), each = 5), 99), times = 2),
  Day = rep(c(rep(c(1, 2, 3, 4, 5), times = 4), 1), times = 2),
  value = rep(
    c(5, 2, 1, 2, 8, 6, 7, 9, 3, 5, 2, 1, 2, 8, 6, 55, 66, 77, 88, 99, 10),
    times = 2
  )
)

# Rows carrying the value to distribute: Seed == 99.
seeds <- df %>%
  filter(Seed == 99)

# All remaining rows (the regular seeds).
others <- df %>%
  filter(Seed != 99)

# Regular-seed rows that have a Seed == 99 partner in the same
# Year/Cluster/Day: add the partner's value to their own.
matches <- others %>%
  inner_join(select(seeds, -Seed), by = c("Year", "Cluster", "Day")) %>%
  mutate(value = value.x + value.y) %>%
  select(Year, Cluster, Seed, Day, value)

# Seed == 99 rows with no regular-seed row in their Year/Cluster/Day are kept
# unchanged.
no_matches <- anti_join(seeds, others, by = c("Year", "Cluster", "Day"))

# Regular-seed rows with no Seed == 99 partner are also kept unchanged; the
# inner_join above drops them, so without this step they would be lost.
untouched <- anti_join(others, seeds, by = c("Year", "Cluster", "Day"))

bind_rows(matches, no_matches, untouched) %>%
  arrange(Year, Cluster, Seed, Day)
#>    Year Cluster Seed Day value
#> 1     1       a    1   1    60
#> 2     1       a    1   2    68
#> 3     1       a    1   3    78
#> 4     1       a    1   4    90
#> 5     1       a    1   5   107
#> 6     1       a    2   1    61
#> 7     1       a    2   2    73
#> 8     1       a    2   3    86
#> 9     1       a    2   4    91
#> 10    1       a    2   5   104
#> 11    1       a    3   1    57
#> 12    1       a    3   2    67
#> 13    1       a    3   3    79
#> 14    1       a    3   4    96
#> 15    1       a    3   5   105
#> 16    1       c   99   1    10
#> 17    2       b    1   1    60
#> 18    2       b    1   2    68
#> 19    2       b    1   3    78
#> 20    2       b    1   4    90
#> 21    2       b    1   5   107
#> 22    2       b    2   1    61
#> 23    2       b    2   2    73
#> 24    2       b    2   3    86
#> 25    2       b    2   4    91
#> 26    2       b    2   5   104
#> 27    2       b    3   1    57
#> 28    2       b    3   2    67
#> 29    2       b    3   3    79
#> 30    2       b    3   4    96
#> 31    2       b    3   5   105
#> 32    2       d   99   1    10

由 reprex 包(v0.2.1)于 2018-11-23 创建