根据R中的多列查找数据框中的重复行

时间:2018-09-04 15:01:18

标签: r dataframe dplyr duplicates

我有一个数据集,其中有一些实例,对于给定位置,同一日期和时间的值是不同的。我正在尝试创建一个显示这些实例的子集数据框。这是我的意思的一个例子: enter image description here

我已经在SO上看过类似的问题,但是我似乎无法获得想要的东西。我不断找回情况并非如此的实例。

这是我正在使用的代码:

dat1<-data_concern_join2%>%
 group_by(locid,stdate,sttime,charnam,valunit)%>%
  filter(n()>1) 

样本数据:

structure(list(orgid = c("USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ", 
"USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ", 
"USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ"), locid = c("USGS-01396030", 
"USGS-01396030", "USGS-01389850", "USGS-01389850", "USGS-01391500", 
"USGS-01391500", "USGS-01391500", "USGS-01391500", "USGS-01393960", 
"USGS-01390610", "USGS-01391500", "USGS-01390610", "USGS-01391500", 
"USGS-01378760", "USGS-01378760"), stdate = structure(c(11359, 
11359, 11359, 11359, 16504, 16504, 16112, 16112, 11367, 13860, 
12401, 13860, 16325, 13006, 13006), class = "Date"), sttime = structure(c(34200, 
34200, 36000, 36000, 40500, 40500, 39600, 39600, 36000, 39600, 
32400, 39600, 38400, 36900, 36900), class = c("hms", "difftime"
), units = "secs"), charnam = c("Specific conductance", "Specific conductance", 
"Specific conductance", "Specific conductance", "Specific conductance", 
"Specific conductance", "Specific conductance", "Specific conductance", 
"Specific conductance", "Specific conductance", "Specific conductance", 
"Specific conductance", "Specific conductance", "Specific conductance", 
"Specific conductance"), val = c(7450, 7300, 3210, 3180, 1930, 
1920, 1740, 1650, 1480, 1390, 1380, 1330, 1300, 1280, 1270), 
    valunit = c("uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", 
    "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", 
    "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", 
    "uS/cm @25C"), swqs = c("FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT", 
    "FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT", 
    "FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT"), WMA = c(7L, 
    7L, 4L, 4L, 4L, 4L, 4L, 4L, 7L, 4L, 4L, 4L, 4L, 6L, 6L), 
    year = c(2001L, 2001L, 2001L, 2001L, 2015L, 2015L, 2014L, 
    2014L, 2001L, 2007L, 2003L, 2007L, 2014L, 2005L, 2005L), 
    HUC14 = c("HUC02030104050090", "HUC02030104050090", "HUC02030103120050", 
    "HUC02030103120050", "HUC02030103140070", "HUC02030103140070", 
    "HUC02030103140070", "HUC02030103140070", "HUC02030104050010", 
    "HUC02030103140010", "HUC02030103140070", "HUC02030103140010", 
    "HUC02030103140070", "HUC02030103010040", "HUC02030103010040"
    )), .Names = c("orgid", "locid", "stdate", "sttime", "charnam", 
"val", "valunit", "swqs", "WMA", "year", "HUC14"), row.names = c(NA, 
-15L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = c("locid", 
"stdate", "sttime", "charnam", "valunit"), drop = TRUE, indices = list(
    13:14, 2:3, c(9L, 11L), 10L, 6:7, 12L, 4:5, 8L, 0:1), group_sizes = c(2L, 
2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), biggest_group_size = 2L, labels = structure(list(
    locid = c("USGS-01378760", "USGS-01389850", "USGS-01390610", 
    "USGS-01391500", "USGS-01391500", "USGS-01391500", "USGS-01391500", 
    "USGS-01393960", "USGS-01396030"), stdate = structure(c(13006, 
    11359, 13860, 12401, 16112, 16325, 16504, 11367, 11359), class = "Date"), 
    sttime = structure(c(36900, 36000, 39600, 32400, 39600, 38400, 
    40500, 36000, 34200), class = c("hms", "difftime"), units = "secs"), 
    charnam = c("Specific conductance", "Specific conductance", 
    "Specific conductance", "Specific conductance", "Specific conductance", 
    "Specific conductance", "Specific conductance", "Specific conductance", 
    "Specific conductance"), valunit = c("uS/cm @25C", "uS/cm @25C", 
    "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", 
    "uS/cm @25C", "uS/cm @25C")), row.names = c(NA, -9L), class = "data.frame", vars = c("locid", 
"stdate", "sttime", "charnam", "valunit"), drop = TRUE, .Names = c("locid", 
"stdate", "sttime", "charnam", "valunit")))

4 个答案:

答案 0 :(得分:3)

我们可以做到

library(data.table)
unique(setDT(data_concern_join2), 
       by = c('locid', 'stdate', 'sttime', 'charnam', 'valunit'))

答案 1 :(得分:1)

这对您有用吗?

dat1<-data_concern_join2%>%
 group_by(locid,stdate,sttime,charnam,valunit)%>%
  mutate(count = 1:n()) %>% # the number should increase with each replicated row
  filter(count == 1)  # filter only the first and remove the others

输出:

# A tibble: 9 x 12
# Groups:   locid, stdate, sttime, charnam, valunit [9]
  orgid   locid         stdate     sttime charnam                val valunit    swqs     WMA  year HUC14             count
  <chr>   <chr>         <date>     <time> <chr>                <dbl> <chr>      <chr>  <int> <int> <chr>             <int>
1 USGS-NJ USGS-01396030 2001-02-06 09:30  Specific conductance  7450 uS/cm @25C FW2-NT     7  2001 HUC02030104050090     1
2 USGS-NJ USGS-01389850 2001-02-06 10:00  Specific conductance  3210 uS/cm @25C FW2-NT     4  2001 HUC02030103120050     1
3 USGS-NJ USGS-01391500 2015-03-10 11:15  Specific conductance  1930 uS/cm @25C FW2-NT     4  2015 HUC02030103140070     1
4 USGS-NJ USGS-01391500 2014-02-11 11:00  Specific conductance  1740 uS/cm @25C FW2-NT     4  2014 HUC02030103140070     1
5 USGS-NJ USGS-01393960 2001-02-14 10:00  Specific conductance  1480 uS/cm @25C FW2-NT     7  2001 HUC02030104050010     1
6 USGS-NJ USGS-01390610 2007-12-13 11:00  Specific conductance  1390 uS/cm @25C FW2-NT     4  2007 HUC02030103140010     1
7 USGS-NJ USGS-01391500 2003-12-15 09:00  Specific conductance  1380 uS/cm @25C FW2-NT     4  2003 HUC02030103140070     1
8 USGS-NJ USGS-01391500 2014-09-12 10:40  Specific conductance  1300 uS/cm @25C FW2-NT     4  2014 HUC02030103140070     1
9 USGS-NJ USGS-01378760 2005-08-11 10:15  Specific conductance  1280 uS/cm @25C FW2-NT     6  2005 HUC02030103010040     1

答案 2 :(得分:1)

dplyr的{​​{1}}专为此目的

distinct

答案 3 :(得分:1)

首先,一个简单的问题是删除一列中重复的整个行:

library(dplyr)

df <- tibble(x=c(1,1:4), y=letters[1:5])

df
#> # A tibble: 5 x 2
#>       x y    
#>   <dbl> <chr>
#> 1     1 a    
#> 2     1 b    
#> 3     2 c    
#> 4     3 d    
#> 5     4 e   

df %>% 
  group_by(x) %>%
  mutate(rank=row_number(x)) %>%
  filter(rank==1)
  # optional: %>% ungroup() %>% select(-rank)
#> # A tibble: 4 x 3
#> # Groups:   x [4]
#>       x y      rank
#>   <dbl> <fct> <int>
#> 1     1 a         1
#> 2     2 c         1
#> 3     3 d         1
#> 4     4 e         1

如果您的数据库支持窗口函数,但用dplyr动词表示,这是SQL中的一种常见模式。这是Redshift / PostgreSQL表单:

select * from df where 1 = row_number() over (partition by x)

因此,如果您只需要重复项(x被重复的行),则只需将rank==1替换为rank>1

df %>% 
  group_by(x) %>%
  mutate(rank=row_number(x)) %>%
  filter(rank>1)

#> # A tibble: 1 x 3
#> # Groups:   x [1]
#>       x y      rank
#>   <dbl> <fct> <int>
#> 1     1 b         2

上面没有显示每个x的原始行,只是重复的行。如果您想要所有行,那么我认为其他答案都可以解决,但我的版本是:

x_dupes <- 
  df %>% 
  group_by(x) %>%
  summarise(n=n()) %>%
  filter(n>1) 

df %>% semi_join(x_dupes, by = "x")

#> # A tibble: 2 x 2
#>       x y    
#>   <dbl> <chr>
#> 1     1 a    
#> 2     1 b  

最后,您可能还关心行的顺序,因为您可能对重复项有一个固定的看法。在此示例中,对于重复的y,我按字母顺序保留了x的最后一个值。

df %>% 
  group_by(x) %>%
  arrange(desc(y)) %>%
  mutate(rank=row_number(x)) %>%
  filter(rank==1)

#> # A tibble: 4 x 3
#> # Groups:   x [4]
#>       x y      rank
#>   <dbl> <fct> <int>
#> 1     4 e         1
#> 2     3 d         1
#> 3     2 c         1
#> 4     1 b         1

请注意,与上述第二个示例相比,ya,现在在重复行中为b。行的顺序也相反。