我正在尝试查看(不删除)数据框中重复的行。我遇到的问题是,当我使用 duplicated 时,它会把每组重复行中的一行视为原始行,因此不会把这一行包含在结果中。
我需要看到所有存在重复的行(包括每组中的第一行)。我已经在 Stack Overflow 和谷歌上搜索过,但没有找到解决办法。有谁知道该怎么做吗?提前谢谢。
数据:
> dput(testx1)
structure(list(DISASTER_NUMBER = c(1921L, 1921L, 1921L, 1921L,
1921L, 1921L, 1921L, 1921L, 1922L, 1922L, 1922L, 1922L, 1922L,
1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L,
1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L, 1922L
), PW_NUMBER = c(498L, 500L, 501L, 502L, 510L, 519L, 542L, 542L,
1L, 1L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 14L, 14L, 15L, 15L, 15L,
16L, 16L, 16L, 17L, 17L, 18L, 18L, 18L, 18L), VERSION_NUMBER = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 2L), PROJECT_AMOUNT = c(53388,
84, 2912, 13555, 12921, 53068, 1738887, 23101, 12792, 3986, 14701,
13544, 18120, 20066, 525251, 0, 11976, 16016, 12025, 3363, 29894,
23845, 4120, 3550, 2261, 3327, 17521, 2670, 54467, 163913, 220707
), TOTAL_ELIGIBLE = c(53388, 84, 2912, 13555, 12921, 53068, 1738887,
23101, 12792, 3986, 14701, 13544, 18120, 20066, 525251, 0, 11976,
16016, 12025, 3363, 29894, 23845, 4120, 3550, 2261, 3327, 17521,
2670, 54467, 163913, 220707), TOTAL_OBLIGATED = c(40041, 63,
2184, 10167, 9690, 39801, 1304165, 17326, 9594, 2990, 11025,
13544, 13590, 15050, 525251, 0, 8982, 12012, 9019, 2522, 29894,
23845, 3090, 3550, 2261, 3327, 13141, 2670, 40850, 122935, 0),
MITIGATION_COST = c(0, 0, 0, 13555, 2250, 0, 1028338, 0,
3987, 0, 18120, 18120, 0, 97426, 97426, 0, 0, 9060, 0, 19129,
19129, 0, 3966, 3966, 0, 8712, 8712, 18327, 18327, -10768,
0)), .Names = c("DISASTER_NUMBER", "PW_NUMBER", "VERSION_NUMBER",
"PROJECT_AMOUNT", "TOTAL_ELIGIBLE", "TOTAL_OBLIGATED", "MITIGATION_COST"
), row.names = 77710:77740, class = "data.frame")
代码:
# Restrict to the three key columns used to decide whether rows are duplicates.
testx2.0 <- subset(testx1, select = DISASTER_NUMBER:VERSION_NUMBER)
# duplicated() flags only the second and later occurrence of each key
# combination, so the first row of every duplicate group is not selected here.
testx2.1 <- which(duplicated(testx2.0))
# Pull the flagged rows, with all of their columns, from the original data.
testx2.2 <- testx1[testx2.1, ]
答案 0 :(得分:0)
也许你正在寻找这个:
library(dplyr)
# Keep only the three key columns that identify a record.
testx2.0 <- testx1 %>%
  select(DISASTER_NUMBER:VERSION_NUMBER)

# Attach to every row the number of rows sharing its key combination.
# All rows are kept; dups > 1 means the row belongs to a duplicated group.
testx2.1 <- testx2.0 %>%
  group_by(DISASTER_NUMBER, PW_NUMBER, VERSION_NUMBER) %>%
  mutate(dups = n()) %>%
  # Drop the grouping so later verbs on testx2.1 do not silently run per group.
  ungroup()
结果:
> testx2.1
DISASTER_NUMBER PW_NUMBER VERSION_NUMBER dups
1 1921 498 0 1
2 1921 500 0 1
3 1921 501 0 1
4 1921 502 0 1
5 1921 510 0 1
6 1921 519 0 1
7 1921 542 0 1
8 1921 542 1 1
9 1922 1 0 1
10 1922 1 1 1
11 1922 7 0 2
12 1922 7 0 2
13 1922 7 1 1
14 1922 9 0 2
15 1922 9 0 2
16 1922 9 1 2
17 1922 9 1 2
18 1922 14 0 1
19 1922 14 1 1
20 1922 15 0 2
21 1922 15 0 2
22 1922 15 1 1
23 1922 16 0 2
24 1922 16 0 2
25 1922 16 1 1
26 1922 17 0 2
27 1922 17 0 2
28 1922 18 0 2
29 1922 18 0 2
30 1922 18 1 1
31 1922 18 2 1
答案 1 :(得分:0)
也许这就是你要找的?这里的 df 是你的数据框。
library(tidyverse)
# Count the rows for each key combination and keep only the combinations
# that occur more than once.
duplicates <- df %>%
  group_by(DISASTER_NUMBER, PW_NUMBER, VERSION_NUMBER) %>%
  summarise(sum_duplicate = n()) %>%
  # summarise() removes only the last grouping level; drop the rest explicitly.
  ungroup() %>%
  filter(sum_duplicate > 1)

# Rows of df whose key has no match in `duplicates` get NA in sum_duplicate
# after the left join, so a non-missing count marks a duplicated row.
final <- df %>%
  left_join(
    duplicates,
    by = c("DISASTER_NUMBER", "PW_NUMBER", "VERSION_NUMBER")
  ) %>%
  mutate(duplicate = !is.na(sum_duplicate)) %>%
  select(-sum_duplicate)
最终得到的表(final)如下:
DISASTER_NUMBER PW_NUMBER VERSION_NUMBER PROJECT_AMOUNT TOTAL_ELIGIBLE TOTAL_OBLIGATED MITIGATION_COST duplicate
1 1921 498 0 53388 53388 40041 0 FALSE
2 1921 500 0 84 84 63 0 FALSE
3 1921 501 0 2912 2912 2184 0 FALSE
4 1921 502 0 13555 13555 10167 13555 FALSE
5 1921 510 0 12921 12921 9690 2250 FALSE
6 1921 519 0 53068 53068 39801 0 FALSE
7 1921 542 0 1738887 1738887 1304165 1028338 FALSE
8 1921 542 1 23101 23101 17326 0 FALSE
9 1922 1 0 12792 12792 9594 3987 FALSE
10 1922 1 1 3986 3986 2990 0 FALSE
11 1922 7 0 14701 14701 11025 18120 TRUE
12 1922 7 0 13544 13544 13544 18120 TRUE
13 1922 7 1 18120 18120 13590 0 FALSE
14 1922 9 0 20066 20066 15050 97426 TRUE
15 1922 9 0 525251 525251 525251 97426 TRUE
16 1922 9 1 0 0 0 0 TRUE
17 1922 9 1 11976 11976 8982 0 TRUE
18 1922 14 0 16016 16016 12012 9060 FALSE
19 1922 14 1 12025 12025 9019 0 FALSE
20 1922 15 0 3363 3363 2522 19129 TRUE
21 1922 15 0 29894 29894 29894 19129 TRUE
22 1922 15 1 23845 23845 23845 0 FALSE
23 1922 16 0 4120 4120 3090 3966 TRUE
24 1922 16 0 3550 3550 3550 3966 TRUE
25 1922 16 1 2261 2261 2261 0 FALSE
26 1922 17 0 3327 3327 3327 8712 TRUE
27 1922 17 0 17521 17521 13141 8712 TRUE
28 1922 18 0 2670 2670 2670 18327 TRUE
29 1922 18 0 54467 54467 40850 18327 TRUE
30 1922 18 1 163913 163913 122935 -10768 FALSE
31 1922 18 2 220707 220707 0 0 FALSE