duplicated() 会漏掉每组重复中的一行,怎样才能查看所有重复的行?

时间:2016-12-29 16:20:54

标签: r

我想查看(而不是删除)数据框中的重复行。问题在于,duplicated() 会把每组重复行中的第一行视为原始行,不会将其标记出来,而我需要看到所有存在重复的行。我已经在 Stack Overflow 和 Google 上搜索过,但没有找到解决办法。有人知道该怎么做吗?先谢谢了。

数据:

    > dput(testx1)
structure(list(DISASTER_NUMBER = c(1921L, 1921L, 1921L, 1921L, 
1921L, 1921L, 1921L, 1921L, 1922L, 1922L, 1922L, 1922L, 1922L, 
1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 
1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L
), PW_NUMBER = c(498L, 500L, 501L, 502L, 510L, 519L, 542L, 542L, 
1L, 1L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 14L, 14L, 15L, 15L, 15L, 
16L, 16L, 16L, 17L, 17L, 18L, 18L, 18L, 18L), VERSION_NUMBER = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 
0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 2L), PROJECT_AMOUNT = c(53388, 
84, 2912, 13555, 12921, 53068, 1738887, 23101, 12792, 3986, 14701, 
13544, 18120, 20066, 525251, 0, 11976, 16016, 12025, 3363, 29894, 
23845, 4120, 3550, 2261, 3327, 17521, 2670, 54467, 163913, 220707
), TOTAL_ELIGIBLE = c(53388, 84, 2912, 13555, 12921, 53068, 1738887, 
23101, 12792, 3986, 14701, 13544, 18120, 20066, 525251, 0, 11976, 
16016, 12025, 3363, 29894, 23845, 4120, 3550, 2261, 3327, 17521, 
2670, 54467, 163913, 220707), TOTAL_OBLIGATED = c(40041, 63, 
2184, 10167, 9690, 39801, 1304165, 17326, 9594, 2990, 11025, 
13544, 13590, 15050, 525251, 0, 8982, 12012, 9019, 2522, 29894, 
23845, 3090, 3550, 2261, 3327, 13141, 2670, 40850, 122935, 0), 
    MITIGATION_COST = c(0, 0, 0, 13555, 2250, 0, 1028338, 0, 
    3987, 0, 18120, 18120, 0, 97426, 97426, 0, 0, 9060, 0, 19129, 
    19129, 0, 3966, 3966, 0, 8712, 8712, 18327, 18327, -10768, 
    0)), .Names = c("DISASTER_NUMBER", "PW_NUMBER", "VERSION_NUMBER", 
"PROJECT_AMOUNT", "TOTAL_ELIGIBLE", "TOTAL_OBLIGATED", "MITIGATION_COST"
), row.names = 77710:77740, class = "data.frame")

代码:

# Attempt from the question: reduce the data to the three key columns, then
# use duplicated() to locate repeated key combinations.
testx2.0 <- testx1 %>% subset(select = DISASTER_NUMBER:VERSION_NUMBER)
# duplicated() flags only the second and later occurrences of each key
# combination, so the first row of every duplicate group is not returned.
testx2.1 <- which(duplicated(testx2.0))
# Index the full data by those positions; the result therefore omits one row
# per duplicate group, which is the problem described above.
testx2.2 <- testx1[testx2.1, ]

2 个答案:

答案 0 :(得分:0)

也许你正在寻找这个:

library(dplyr)

# Keep only the three columns that together identify a record.
testx2.0 <- testx1 %>%
  select(DISASTER_NUMBER:VERSION_NUMBER)

# Annotate every row with the size of its key group: dups == 1 means the key
# combination is unique, dups > 1 means the row has at least one duplicate.
# No row is dropped, so the first occurrence of each duplicate is kept too.
testx2.1 <- testx2.0 %>%
  group_by(DISASTER_NUMBER, PW_NUMBER, VERSION_NUMBER) %>%
  mutate(dups = n()) %>%
  # Drop the grouping so later verbs on testx2.1 are not silently applied
  # per group.
  ungroup()

结果:

> testx2.1
   DISASTER_NUMBER PW_NUMBER VERSION_NUMBER dups
1             1921       498              0    1
2             1921       500              0    1
3             1921       501              0    1
4             1921       502              0    1
5             1921       510              0    1
6             1921       519              0    1
7             1921       542              0    1
8             1921       542              1    1
9             1922         1              0    1
10            1922         1              1    1
11            1922         7              0    2
12            1922         7              0    2
13            1922         7              1    1
14            1922         9              0    2
15            1922         9              0    2
16            1922         9              1    2
17            1922         9              1    2
18            1922        14              0    1
19            1922        14              1    1
20            1922        15              0    2
21            1922        15              0    2
22            1922        15              1    1
23            1922        16              0    2
24            1922        16              0    2
25            1922        16              1    1
26            1922        17              0    2
27            1922        17              0    2
28            1922        18              0    2
29            1922        18              0    2
30            1922        18              1    1
31            1922        18              2    1

答案 1 :(得分:0)

也许这就是你要找的东西?这里的 df 是你的数据框。

library(tidyverse)

# One row per key combination that occurs more than once in df, together with
# the number of rows sharing that combination.
duplicates <- df %>%
  group_by(DISASTER_NUMBER, PW_NUMBER, VERSION_NUMBER) %>%
  summarise(sum_duplicate = n()) %>%
  # summarise() peels off only the last grouping level; drop the rest so the
  # lookup table is a plain, ungrouped table.
  ungroup() %>%
  filter(sum_duplicate > 1)

# Attach the lookup to the full data. Rows whose key combination is not in
# `duplicates` get NA for sum_duplicate, so "not NA" is exactly "this row has
# at least one duplicate". The helper column is dropped afterwards.
final <- df %>%
  left_join(
    duplicates,
    by = c("DISASTER_NUMBER", "PW_NUMBER", "VERSION_NUMBER")
  ) %>%
  mutate(duplicate = !is.na(sum_duplicate)) %>%
  select(-sum_duplicate)

最终得到的表(final)如下:

 DISASTER_NUMBER PW_NUMBER VERSION_NUMBER PROJECT_AMOUNT TOTAL_ELIGIBLE TOTAL_OBLIGATED MITIGATION_COST duplicate
1             1921       498              0          53388          53388           40041               0     FALSE
2             1921       500              0             84             84              63               0     FALSE
3             1921       501              0           2912           2912            2184               0     FALSE
4             1921       502              0          13555          13555           10167           13555     FALSE
5             1921       510              0          12921          12921            9690            2250     FALSE
6             1921       519              0          53068          53068           39801               0     FALSE
7             1921       542              0        1738887        1738887         1304165         1028338     FALSE
8             1921       542              1          23101          23101           17326               0     FALSE
9             1922         1              0          12792          12792            9594            3987     FALSE
10            1922         1              1           3986           3986            2990               0     FALSE
11            1922         7              0          14701          14701           11025           18120      TRUE
12            1922         7              0          13544          13544           13544           18120      TRUE
13            1922         7              1          18120          18120           13590               0     FALSE
14            1922         9              0          20066          20066           15050           97426      TRUE
15            1922         9              0         525251         525251          525251           97426      TRUE
16            1922         9              1              0              0               0               0      TRUE
17            1922         9              1          11976          11976            8982               0      TRUE
18            1922        14              0          16016          16016           12012            9060     FALSE
19            1922        14              1          12025          12025            9019               0     FALSE
20            1922        15              0           3363           3363            2522           19129      TRUE
21            1922        15              0          29894          29894           29894           19129      TRUE
22            1922        15              1          23845          23845           23845               0     FALSE
23            1922        16              0           4120           4120            3090            3966      TRUE
24            1922        16              0           3550           3550            3550            3966      TRUE
25            1922        16              1           2261           2261            2261               0     FALSE
26            1922        17              0           3327           3327            3327            8712      TRUE
27            1922        17              0          17521          17521           13141            8712      TRUE
28            1922        18              0           2670           2670            2670           18327      TRUE
29            1922        18              0          54467          54467           40850           18327      TRUE
30            1922        18              1         163913         163913          122935          -10768     FALSE
31            1922        18              2         220707         220707               0               0     FALSE