使用以下数据框:
# Example data: nine records, each with a unique id, a (repeated) name and
# two 0/1 condition flags.
df <- data.frame(
  id = c(
    1324876, 2457387, 3136822, 4607984, 5049365,
    6395867, 7847307, 8347562, 9283756
  ),
  name = c(
    "Anne", "Jack", "Bill", "Mary", "Bill",
    "Mary", "Anne", "Jack", "Mary"
  ),
  cond1 = c(1, 0, 0, 1, 0, 0, 1, 1, 0),
  cond2 = c(0, 1, 0, 0, 1, 1, 1, 0, 0)
)
> df[order(df$name),]
id name cond1 cond2
1 1324876 Anne 1 0
7 7847307 Anne 1 1
3 3136822 Bill 0 0
5 5049365 Bill 0 1
2 2457387 Jack 0 1
8 8347562 Jack 1 0
4 4607984 Mary 1 0
6 6395867 Mary 0 1
9 9283756 Mary 0 0
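假设我只希望每个名称保留一行。我将根据以下优先顺序选择要保留的行(原列表在抓取时丢失,以下顺序根据下方各答案推断):
1. cond1 和 cond2 均为 1 的行;
2. 仅 cond1 为 1 的行;
3. 仅 cond2 为 1 的行;
4. cond1 和 cond2 均为 0 的行。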
现在,我不想在要保留的行上添加 keep 标志,而是想知道:如何为每个将被删除的重复行创建一个 keep_instead 列,其中存放将被保留的那一行的 id?在这种情况下,我们将得到以下结果:
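id name cond1 cond2 keep_instead
1 1324876 Anne 1 0 7847307
7 7847307 Anne 1 1 NA
3 3136822 Bill 0 0 5049365
5 5049365 Bill 0 1 NA
2 2457387 Jack 0 1 8347562
8 8347562 Jack 1 0 NA
4 4607984 Mary 1 0 NA
6 6395867 Mary 0 1 4607984
9 9283756 Mary 0 0 4607984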
答案 0 :(得分:2)
library(dplyr)
# For every name, point each row that will be dropped at the id of the row
# that will be kept; the kept row itself gets a missing value (NA).
df %>%
  group_by(name) %>%
  # Rank rows by 2 * cond1 + cond2 so that cond1 outweighs cond2
  # (3 = both set, 2 = cond1 only, 1 = cond2 only, 0 = neither).
  # arrange() ignores the grouping by default, so this sorts the whole table;
  # the per-group row order seen by mutate() below is still highest score first.
  arrange(desc(cond1 * 2 + cond2)) %>%
  # The first row of each group is the keeper. NA_real_ (not NaN) marks it:
  # the value is missing, not the result of an invalid computation, and
  # NA_real_ has the same type as the double id column, as if_else() requires.
  mutate(keep_instead = if_else(row_number() > 1, first(id), NA_real_)) %>%
  ungroup() %>%
  arrange(name)
#> # A tibble: 9 x 5
#>        id name  cond1 cond2 keep_instead
#>     <dbl> <fct> <dbl> <dbl>        <dbl>
#> 1 7847307 Anne      1     1           NA
#> 2 1324876 Anne      1     0      7847307
#> 3 5049365 Bill      0     1           NA
#> 4 3136822 Bill      0     0      5049365
#> 5 8347562 Jack      1     0           NA
#> 6 2457387 Jack      0     1      8347562
#> 7 4607984 Mary      1     0           NA
#> 8 6395867 Mary      0     1      4607984
#> 9 9283756 Mary      0     0      4607984
答案 1 :(得分:0)
我们可以使用
library(dplyr)
library(stringr)
# Same idea, but with the priority of the (cond1, cond2) pairs spelled out
# explicitly as factor levels instead of a numeric score.
df %>%
  arrange(name) %>%
  # rn records each row's position in the name-sorted input.
  mutate(rn = row_number()) %>%
  # Within each name, order rows by priority: "11" > "10" > "01" > "00".
  arrange(
    name,
    factor(str_c(cond1, cond2), levels = c("11", "10", "01", "00"))
  ) %>%
  group_by(name) %>%
  # Every row first receives the id of its group's top-priority row ...
  mutate(keep_instead = first(id)) %>%
  # ... then the first occurrence in each group (the kept row itself) is
  # blanked out, leaving the id only on the rows that will be dropped.
  mutate(
    keep_instead = replace(keep_instead, !duplicated(keep_instead), NA)
  ) %>%
  # Drop the grouping so later verbs do not silently operate per name.
  ungroup()
# A tibble: 9 x 6
#        id name  cond1 cond2    rn keep_instead
#     <dbl> <fct> <dbl> <dbl> <int>        <dbl>
#1 7847307 Anne      1     1     2           NA
#2 1324876 Anne      1     0     1      7847307
#3 5049365 Bill      0     1     4           NA
#4 3136822 Bill      0     0     3      5049365
#5 8347562 Jack      1     0     6           NA
#6 2457387 Jack      0     1     5      8347562
#7 4607984 Mary      1     0     7           NA
#8 6395867 Mary      0     1     8      4607984
#9 9283756 Mary      0     0     9      4607984
答案 2 :(得分:0)
library(data.table)
# setDT() converts df to a data.table by reference (no copy is made).
setDT(df)
# Visit rows in descending (cond1, cond2) order so that the first row of each
# name group is the one to keep: every later row in the group receives that
# row's id and the kept row receives NA. `:=` adds the column to df in place;
# the trailing [order(name)] only sorts the returned table for display.
df[
  order(cond1, cond2, decreasing = TRUE),
  # seq_len(.N) is safe for any group size; NA_real_ keeps the result a double
  # in every group, matching the type of id.
  keep_instead := ifelse(seq_len(.N) > 1L, id[1L], NA_real_),
  by = name
][order(name)]
# id name cond1 cond2 keep_instead
# 1: 1324876 Anne 1 0 7847307
# 2: 7847307 Anne 1 1 NA
# 3: 3136822 Bill 0 0 5049365
# 4: 5049365 Bill 0 1 NA
# 5: 2457387 Jack 0 1 8347562
# 6: 8347562 Jack 1 0 NA
# 7: 4607984 Mary 1 0 NA
# 8: 6395867 Mary 0 1 4607984
# 9: 9283756 Mary 0 0 4607984