删除二元数据集中的重复项(R)

时间:2015-07-27 16:44:34

标签: r

我有一个包含309,888个观测值和121个变量的数据框。我想删除其中重复的二元组(dyad),即同一国家、同一年份中 group_a 与 group_b 只是顺序互换的行。

数据

 # Toy data set: eight rows, four per country, each pairing group_a with
 # group_b in a given year. var1 and var2 are all-zero placeholder columns.
 D1 <- data.frame(
   row     = as.numeric(seq_len(8)),
   country = rep(c("China", "Myanmar"), each = 4),
   year    = rep(c(1990, 1990, 1990, 1991), times = 2),
   group_a = c("Tibetan", "Tibetan", "Han", "Tibetan",
               "Karens", "Bamar", "Bamar", "Bamar"),
   group_b = c("Han", "Manchu", "Tibetan", "Han",
               "Bamar", "Shan", "Karens", "Karens"),
   var1    = numeric(8),
   var2    = numeric(8)
 )

看起来像

  row country year group_a group_b var1 var2
1   1   China 1990 Tibetan     Han    0    0
2   2   China 1990 Tibetan  Manchu    0    0
3   3   China 1990     Han Tibetan    0    0
4   4   China 1991 Tibetan     Han    0    0
5   5 Myanmar 1990  Karens   Bamar    0    0
6   6 Myanmar 1990   Bamar    Shan    0    0
7   7 Myanmar 1990   Bamar  Karens    0    0
8   8 Myanmar 1991   Bamar  Karens    0    0

在此表中,我想删除第3行和第7行:第3行中列'group_a'和'group_b'的组合与第1行相同(国家和年份相同,只是两个组的顺序互换),第7行和第5行也是如此。

所需输出

  row country year group_a group_b var1 var2
1   1   China 1990 Tibetan     Han    0    0
2   2   China 1990 Tibetan  Manchu    0    0
4   4   China 1991 Tibetan     Han    0    0
5   5 Myanmar 1990  Karens   Bamar    0    0
6   6 Myanmar 1990   Bamar    Shan    0    0
8   8 Myanmar 1991   Bamar  Karens    0    0

有关如何执行此操作的任何建议吗?

2 个答案:

答案 0 :(得分:1)

使用data.table:

library(data.table)
setDT(D1)
setkey(D1, row)

# For each (country, year, unordered group pair) keep the smallest `row` id.
# pmin()/pmax() put the two group names in a fixed order and are vectorised,
# so no per-row grouping is needed. They expect character columns here; for
# factor columns wrap group_a/group_b in as.character().
keep_rows <- D1[, .(row = min(row)),
                by = .(country, year,
                       pair_lo = pmin(group_a, group_b),
                       pair_hi = pmax(group_a, group_b))][, row]

# Look the ids up by value. A bare numeric vector in `i` would be read as row
# positions, which only works when `row` happens to equal the row number.
D1[.(row = keep_rows), on = "row"]

如果这两列是因子(factor)类型,您可以先转换类型再转换回来(参见 as.character),或者将调用调整为:

# Factor-safe variant: compare the group names as character strings.
# For each (country, year, unordered group pair) keep the smallest `row` id;
# pmin()/pmax() are vectorised, so no per-row grouping is needed.
keep_rows <- D1[, .(row = min(row)),
                by = .(country, year,
                       pair_lo = pmin(as.character(group_a), as.character(group_b)),
                       pair_hi = pmax(as.character(group_a), as.character(group_b)))][, row]

# Look the ids up by value. A bare numeric vector in `i` would be read as row
# positions, which only works when `row` happens to equal the row number.
D1[.(row = keep_rows), on = "row"]

答案 1 :(得分:1)

你可以这样做

使用dplyr

library(dplyr)

# Work on character values so pmin()/pmax() compare the group names as strings
# (the columns may be factors, depending on how D1 was built).
D1[, c("group_a", "group_b")] <- lapply(D1[, c("group_a", "group_b")], as.character)

# pair_lo/pair_hi hold the two group names in a fixed order, so (A, B) and
# (B, A) produce the same key. Two separate columns avoid the collisions that
# pasting the names together without a separator can cause.
D1 %>%
  mutate(
    pair_lo = pmin(group_a, group_b),
    pair_hi = pmax(group_a, group_b)
  ) %>%
  # .keep_all = TRUE keeps every column of the first row of each dyad; without
  # it, current dplyr returns only the columns named in distinct().
  distinct(country, year, pair_lo, pair_hi, .keep_all = TRUE) %>%
  select(-pair_lo, -pair_hi)

# Rows kept (the printed header and row labels depend on the dplyr version):

#  row country year group_a group_b var1 var2
#1   1   China 1990 Tibetan     Han    0    0
#2   2   China 1990 Tibetan  Manchu    0    0
#3   4   China 1991 Tibetan     Han    0    0
#4   5 Myanmar 1990  Karens   Bamar    0    0
#5   6 Myanmar 1990   Bamar    Shan    0    0
#6   8 Myanmar 1991   Bamar  Karens    0    0

使用data.table

library(data.table)

# Convert the group columns to character by reference so they can be compared
# as strings.
setDT(D1)[, c("group_a", "group_b") := lapply(.SD, as.character),
          .SDcols = c("group_a", "group_b")]

# Build the de-duplication key in a separate table so no helper column is left
# behind in D1. pmin()/pmax() put the two group names in a fixed order and are
# vectorised (no per-row grouping). Two separate columns avoid the collisions
# that pasting the names together without a separator can cause.
dyad_key <- D1[, .(country, year,
                   pair_lo = pmin(group_a, group_b),
                   pair_hi = pmax(group_a, group_b))]

# Keep the first row of every (country, year, unordered pair) combination.
out <- D1[!duplicated(dyad_key, by = names(dyad_key))]
out

#   row country year group_a group_b var1 var2
#1:   1   China 1990 Tibetan     Han    0    0
#2:   2   China 1990 Tibetan  Manchu    0    0
#3:   4   China 1991 Tibetan     Han    0    0
#4:   5 Myanmar 1990  Karens   Bamar    0    0
#5:   6 Myanmar 1990   Bamar    Shan    0    0
#6:   8 Myanmar 1991   Bamar  Karens    0    0

使用 *apply 系列函数的基础 R 替代方案

# Compare the group names as character strings (the columns may be factors).
D1[, c("group_a", "group_b")] <- lapply(D1[, c("group_a", "group_b")], as.character)

# De-duplication key: country, year and the two group names in a fixed order,
# so (A, B) and (B, A) give the same key. It lives in its own data frame, so
# no helper column is added to D1 and nothing has to be dropped by position
# afterwards (a fixed index such as -8 breaks once D1 has a different number
# of columns).
dyad_key <- data.frame(
  country = D1$country,
  year    = D1$year,
  pair_lo = pmin(D1$group_a, D1$group_b),
  pair_hi = pmax(D1$group_a, D1$group_b)
)

# Keep the first occurrence of each key; row order and row names are preserved.
D1[!duplicated(dyad_key), ]