group_by,distinct和多个级别的重复条件

时间:2016-12-28 05:41:25

标签: r dplyr

我有一个数据框(df),它有 3 个分层的分组级别和 4 种成本。group 与 project 是一对多的关系,project 与 versions 也是一对多的关系。costs2.1a 看起来像这样(约 50 万行):

+-------+---------+----------+----------+---------+---------+---------+
| group | project | versions | amount1  | amount2 | amount3 | amount4 |
+-------+---------+----------+----------+---------+---------+---------+
|  1316 |     142 |        0 |   370662 |  370662 |  288264 |    4688 |
|  1316 |     142 |        1 |   174242 |  174242 |  134837 |       0 |
|  1316 |     142 |        1 |   174242 |  174242 |  134837 |  159242 |
|  1316 |     142 |        2 |  -246912 | -246912 | -191073 |       0 |
|  1316 |     142 |        2 |  -246912 | -246912 | -191073 |       0 |
|  1316 |     142 |        2 |  -246912 | -246912 | -191073 |   60591 |
|  1316 |     142 |        2 |  -246912 | -246912 | -191073 |    5000 |
|  1316 |     142 |        3 |   -32437 |  -32437 |  -25101 |       0 |
|  1316 |     142 |        3 |   -32437 |  -32437 |  -25101 |       0 |
|  1316 |     142 |        3 |   -32437 |  -32437 |  -25101 |       0 |
|  1316 |     142 |        3 |   -32437 |  -32437 |  -25101 |       0 |
|  1316 |     143 |        0 |   620515 |  620515 |  480186 |  411400 |
|  1316 |     143 |        1 |   -31113 |  -31113 |  -24077 |       0 |
|  1316 |     151 |        0 |   515269 |  515269 |  398982 |     510 |
|  1316 |     151 |        1 |    85380 |   85380 |   65213 |   85380 |
+-------+---------+----------+----------+---------+---------+---------+

我想先按 'group'、'project' 和 'versions' 这三个变量分组,然后删除在 'amount1' 到 'amount4' 上重复的行。接着,在仍然保持分组的情况下,我想再回头对 'amount4' = 0 的行删除在 'amount1' 到 'amount3' 上重复的行。

我试图通过几种方式找出答案,包括使用if循环,但最接近答案的是这段代码:

library(dplyr)
# Step 1: bind columns 2:4 of mit2.1filter to the cost columns in costsSub,
# group by disaster / PW / version, and keep the first row for each distinct
# combination of the four cost columns within a group.
costs2.1a <- cbind(mit2.1filter[ , 2:4], costsSub) %>% 
  group_by(DISASTER_NUMBER, PW_NUMBER, VERSION_NUMBER) %>%
  distinct(PROJECT_AMOUNT, TOTAL_ELIGIBLE, TOTAL_OBLIGATED, MITIGATION_COST, .keep_all = TRUE)
# Step 2 (the attempt that misbehaves): distinct() returns a data frame, not a
# per-row logical vector, so `distinct(...) & (MITIGATION_COST != 0)` is not a
# row mask for costs2.1a.
# NOTE(review): which() on that result presumably yields positions that do not
# correspond to rows of costs2.1a, which would explain costs2.1b ending up with
# more rows than costs2.1a -- confirm.
costs2.1b <- costs2.1a[which(distinct(costs2.1a, PROJECT_AMOUNT, TOTAL_ELIGIBLE, TOTAL_OBLIGATED, .keep_all = TRUE) & costs2.1a$MITIGATION_COST != 0), ]

我知道它不是我想要的,因为costs2.1a有173,871行,costs2.1b有366,968行;这没有意义。

如果有人知道我在这段代码中做错了什么,或者知道用其他方式实现我想要的效果,我将不胜感激!

修改

这是一些输入数据,谢谢 Jazzaro。

> dput(head(cbind(mit2.1filter[ , 2:4], costsSub), 30))

structure(list(DISASTER_NUMBER = c(1301L, 1301L, 1301L, 1301L, 
1301L, 1301L, 1301L, 1301L, 1302L, 1302L, 1302L, 1302L, 1302L, 
1302L, 1302L, 1302L, 1302L, 1302L, 1303L, 1303L, 1303L, 1305L, 
1305L, 1305L, 1306L, 1306L, 1306L, 1306L, 1306L, 1306L), PW_NUMBER = c(6L, 
35L, 70L, 71L, 83L, 121L, 121L, 125L, 9L, 37L, 37L, 58L, 60L, 
62L, 65L, 124L, 124L, 124L, 184L, 184L, 184L, 10L, 28L, 29L, 
1235L, 1235L, 1349L, 1349L, 1349L, 1349L), VERSION_NUMBER = c(0L, 
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 
2L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 2L, 2L), PROJECT_AMOUNT = c(6787, 
16402, 33843, 28169, 12148, 217010, -151101, 13786, 1625, 60260, 
79755, 39894, 11352, 7521, 2950, 82010, 20316, 2646, 243190, 
-111638, -14021, 3364, 11421, 7534, 184012, -184012, 229357, 
0, -65000, -65000), TOTAL_ELIGIBLE = c(6787, 16402, 33843, 28169, 
12148, 217010, -151101, 13786, 1625, 60260, 79755, 39894, 11352, 
7521, 2950, 82010, 20316, 2646, 243190, -111638, -14021, 3364, 
11421, 7534, 184012, -184012, 229357, 0, -65000, -65000), TOTAL_OBLIGATED = c(5347, 
12921, 26320, 21907, 9745, 168768, -117512, 10721, 1306, 46878, 
62026, 31740, 9032, 5984, 2347, 64607, 15855, 2058, 189129, -86391, 
-10850, 2703, 9087, 5994, 139624, -139624, 174030, 0, -49320, 
-49320), MITIGATION_COST = c(1780, 1575, 1500, 6600, 0, 3885, 
0, 6230, 0, 0, 0, 10222, 1410, 1821, 528, 0, 0, 0, 0, 27187, 
0, 1050, 3280, 204, 87984, 0, 65000, 0, -65000, 0)), .Names = c("DISASTER_NUMBER", 
"PW_NUMBER", "VERSION_NUMBER", "PROJECT_AMOUNT", "TOTAL_ELIGIBLE", 
"TOTAL_OBLIGATED", "MITIGATION_COST"), row.names = c(NA, 30L), class = "data.frame")

> dput(head(costs2.1a, 30))  ##this is output data!!

structure(list(DISASTER_NUMBER = c(1301L, 1301L, 1301L, 1301L, 
1301L, 1301L, 1301L, 1301L, 1302L, 1302L, 1302L, 1302L, 1302L, 
1302L, 1302L, 1302L, 1302L, 1302L, 1303L, 1303L, 1303L, 1305L, 
1305L, 1305L, 1306L, 1306L, 1306L, 1306L, 1306L, 1306L), PW_NUMBER = c(6L, 
35L, 70L, 71L, 83L, 121L, 121L, 125L, 9L, 37L, 37L, 58L, 60L, 
62L, 65L, 124L, 124L, 124L, 184L, 184L, 184L, 10L, 28L, 29L, 
1235L, 1235L, 1349L, 1349L, 1349L, 1349L), VERSION_NUMBER = c(0L, 
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 
2L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 2L, 2L), PROJECT_AMOUNT = c(6787, 
16402, 33843, 28169, 12148, 217010, -151101, 13786, 1625, 60260, 
79755, 39894, 11352, 7521, 2950, 82010, 20316, 2646, 243190, 
-111638, -14021, 3364, 11421, 7534, 184012, -184012, 229357, 
0, -65000, -65000), TOTAL_ELIGIBLE = c(6787, 16402, 33843, 28169, 
12148, 217010, -151101, 13786, 1625, 60260, 79755, 39894, 11352, 
7521, 2950, 82010, 20316, 2646, 243190, -111638, -14021, 3364, 
11421, 7534, 184012, -184012, 229357, 0, -65000, -65000), TOTAL_OBLIGATED = c(5347, 
12921, 26320, 21907, 9745, 168768, -117512, 10721, 1306, 46878, 
62026, 31740, 9032, 5984, 2347, 64607, 15855, 2058, 189129, -86391, 
-10850, 2703, 9087, 5994, 139624, -139624, 174030, 0, -49320, 
-49320), MITIGATION_COST = c(1780, 1575, 1500, 6600, 0, 3885, 
0, 6230, 0, 0, 0, 10222, 1410, 1821, 528, 0, 0, 0, 0, 27187, 
0, 1050, 3280, 204, 87984, 0, 65000, 0, -65000, 0)), .Names = c("DISASTER_NUMBER", 
"PW_NUMBER", "VERSION_NUMBER", "PROJECT_AMOUNT", "TOTAL_ELIGIBLE", 
"TOTAL_OBLIGATED", "MITIGATION_COST"), row.names = c(NA, -30L
), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = list(
    DISASTER_NUMBER, PW_NUMBER, VERSION_NUMBER), drop = TRUE, indices = list(
    0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 
    14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 
    26L, 27L, 28:29), group_sizes = c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L), biggest_group_size = 2L, labels = structure(list(
    DISASTER_NUMBER = c(1301L, 1301L, 1301L, 1301L, 1301L, 1301L, 
    1301L, 1301L, 1302L, 1302L, 1302L, 1302L, 1302L, 1302L, 1302L, 
    1302L, 1302L, 1302L, 1303L, 1303L, 1303L, 1305L, 1305L, 1305L, 
    1306L, 1306L, 1306L, 1306L, 1306L), PW_NUMBER = c(6L, 35L, 
    70L, 71L, 83L, 121L, 121L, 125L, 9L, 37L, 37L, 58L, 60L, 
    62L, 65L, 124L, 124L, 124L, 184L, 184L, 184L, 10L, 28L, 29L, 
    1235L, 1235L, 1349L, 1349L, 1349L), VERSION_NUMBER = c(0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
    1L, 2L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 2L)), row.names = c(NA, 
-29L), class = "data.frame", vars = list(DISASTER_NUMBER, PW_NUMBER, 
    VERSION_NUMBER), drop = TRUE, .Names = c("DISASTER_NUMBER", 
"PW_NUMBER", "VERSION_NUMBER")))

> dput(droplevels(head(costs2.1a, 30)))  ##this is output data!!

structure(list(DISASTER_NUMBER = c(1301L, 1301L, 1301L, 1301L, 
1301L, 1301L, 1301L, 1301L, 1302L, 1302L, 1302L, 1302L, 1302L, 
1302L, 1302L, 1302L, 1302L, 1302L, 1303L, 1303L, 1303L, 1305L, 
1305L, 1305L, 1306L, 1306L, 1306L, 1306L, 1306L, 1306L), PW_NUMBER = c(6L, 
35L, 70L, 71L, 83L, 121L, 121L, 125L, 9L, 37L, 37L, 58L, 60L, 
62L, 65L, 124L, 124L, 124L, 184L, 184L, 184L, 10L, 28L, 29L, 
1235L, 1235L, 1349L, 1349L, 1349L, 1349L), VERSION_NUMBER = c(0L, 
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 
2L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 2L, 2L), PROJECT_AMOUNT = c(6787, 
16402, 33843, 28169, 12148, 217010, -151101, 13786, 1625, 60260, 
79755, 39894, 11352, 7521, 2950, 82010, 20316, 2646, 243190, 
-111638, -14021, 3364, 11421, 7534, 184012, -184012, 229357, 
0, -65000, -65000), TOTAL_ELIGIBLE = c(6787, 16402, 33843, 28169, 
12148, 217010, -151101, 13786, 1625, 60260, 79755, 39894, 11352, 
7521, 2950, 82010, 20316, 2646, 243190, -111638, -14021, 3364, 
11421, 7534, 184012, -184012, 229357, 0, -65000, -65000), TOTAL_OBLIGATED = c(5347, 
12921, 26320, 21907, 9745, 168768, -117512, 10721, 1306, 46878, 
62026, 31740, 9032, 5984, 2347, 64607, 15855, 2058, 189129, -86391, 
-10850, 2703, 9087, 5994, 139624, -139624, 174030, 0, -49320, 
-49320), MITIGATION_COST = c(1780, 1575, 1500, 6600, 0, 3885, 
0, 6230, 0, 0, 0, 10222, 1410, 1821, 528, 0, 0, 0, 0, 27187, 
0, 1050, 3280, 204, 87984, 0, 65000, 0, -65000, 0)), .Names = c("DISASTER_NUMBER", 
"PW_NUMBER", "VERSION_NUMBER", "PROJECT_AMOUNT", "TOTAL_ELIGIBLE", 
"TOTAL_OBLIGATED", "MITIGATION_COST"), row.names = c(NA, -30L
), vars = list(DISASTER_NUMBER, PW_NUMBER, VERSION_NUMBER), drop = TRUE, indices = list(
    0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 
    14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 
    26L, 27L, 28:29), group_sizes = c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L), biggest_group_size = 2L, labels = structure(list(
    DISASTER_NUMBER = c(1301L, 1301L, 1301L, 1301L, 1301L, 1301L, 
    1301L, 1301L, 1302L, 1302L, 1302L, 1302L, 1302L, 1302L, 1302L, 
    1302L, 1302L, 1302L, 1303L, 1303L, 1303L, 1305L, 1305L, 1305L, 
    1306L, 1306L, 1306L, 1306L, 1306L), PW_NUMBER = c(6L, 35L, 
    70L, 71L, 83L, 121L, 121L, 125L, 9L, 37L, 37L, 58L, 60L, 
    62L, 65L, 124L, 124L, 124L, 184L, 184L, 184L, 10L, 28L, 29L, 
    1235L, 1235L, 1349L, 1349L, 1349L), VERSION_NUMBER = c(0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
    1L, 2L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 2L)), row.names = c(NA, 
-29L), class = "data.frame", vars = list(DISASTER_NUMBER, PW_NUMBER, 
    VERSION_NUMBER), drop = TRUE, .Names = c("DISASTER_NUMBER", 
"PW_NUMBER", "VERSION_NUMBER")), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame")) 

1 个答案:

答案 0 :(得分:0)

第一步相对简单:我们先用 gather 把数据重塑为长格式,用 distinct 仅保留唯一值,然后再用 spread 重塑回宽格式。请注意,amount2 这一列会完全丢失,因为在示例数据中它的每个值都与同一行的 amount1 相同,没有一个是唯一的。

# gather() and spread() come from tidyr, distinct() and %>% from dplyr; load
# both so this snippet runs in a fresh session.
library(dplyr)
library(tidyr)

# Step 1: drop values that repeat within a group / project / version.
#   1. gather() stacks amount1..amount4 into long form: one row per value,
#      with the source column name stored in `key`.
#   2. distinct() keeps the first row for each
#      (group, project, versions, value) combination.
#   3. spread() turns `key` back into one column per amount. Cells whose value
#      was dropped in (2) come back as NA, and an amount column with no
#      surviving value disappears entirely.
d <- costs2.1a %>%
  gather("key", "value", amount1:amount4) %>%
  distinct(group, project, versions, value, .keep_all = TRUE) %>%
  spread(key, value)

第二部分更难,因为我们无法有条件地使用 distinct。所以我们再次重塑为长格式(d2),仅对 amount4 为 0 的子集使用 distinct(d3),对其余数据(d4)不做任何处理,然后把两者绑定在一起,再重塑回宽格式。

# Step 2: drop repeated amount1..amount3 values within a
# group / project / version, but only for rows whose amount4 is 0.

# Back to long form. amount4 is left out of the gather so it stays a regular
# column that can be filtered on.
d2 <- d %>%
  gather("key", "value", amount1:amount3)

# Rows with amount4 == 0: keep the first row for each
# (group, project, versions, value) combination.
d3 <- d2 %>%
  filter(amount4 == 0) %>%
  distinct(group, project, versions, value, .keep_all = TRUE)

# Everything else is passed through untouched. filter() drops rows where the
# condition evaluates to NA, and step 1 can leave amount4 as NA, so those rows
# are kept explicitly here; otherwise they would fall out of both d3 and d4.
d4 <- d2 %>%
  filter(amount4 != 0 | is.na(amount4))

# Recombine the two subsets and return to wide form (one column per amount).
bind_rows(d3, d4) %>%
  spread(key, value)

数据:

# Reproducible 30-row sample used by the answer, built directly with the short
# column names (group, project, versions, amount1..amount4).
# The three ID columns are integer; the four amount columns are double.
costs2.1a <- data.frame(
  group = rep(
    c(1301L, 1302L, 1303L, 1305L, 1306L),
    times = c(8L, 10L, 3L, 3L, 6L)
  ),
  project = c(
    6L, 35L, 70L, 71L, 83L, 121L, 121L, 125L,
    9L, 37L, 37L, 58L, 60L, 62L, 65L, 124L, 124L, 124L,
    184L, 184L, 184L,
    10L, 28L, 29L,
    1235L, 1235L, 1349L, 1349L, 1349L, 1349L
  ),
  versions = c(
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
    0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 2L,
    0L, 1L, 2L,
    0L, 0L, 0L,
    0L, 1L, 0L, 1L, 2L, 2L
  ),
  amount1 = c(
    6787, 16402, 33843, 28169, 12148, 217010, -151101, 13786,
    1625, 60260, 79755, 39894, 11352, 7521, 2950, 82010, 20316, 2646,
    243190, -111638, -14021,
    3364, 11421, 7534,
    184012, -184012, 229357, 0, -65000, -65000
  ),
  amount2 = c(
    6787, 16402, 33843, 28169, 12148, 217010, -151101, 13786,
    1625, 60260, 79755, 39894, 11352, 7521, 2950, 82010, 20316, 2646,
    243190, -111638, -14021,
    3364, 11421, 7534,
    184012, -184012, 229357, 0, -65000, -65000
  ),
  amount3 = c(
    5347, 12921, 26320, 21907, 9745, 168768, -117512, 10721,
    1306, 46878, 62026, 31740, 9032, 5984, 2347, 64607, 15855, 2058,
    189129, -86391, -10850,
    2703, 9087, 5994,
    139624, -139624, 174030, 0, -49320, -49320
  ),
  amount4 = c(
    1780, 1575, 1500, 6600, 0, 3885, 0, 6230,
    0, 0, 0, 10222, 1410, 1821, 528, 0, 0, 0,
    0, 27187, 0,
    1050, 3280, 204,
    87984, 0, 65000, 0, -65000, 0
  )
)