删除R中数据框中重复单元格的所有实例(不是整个行/列)

时间:2018-10-15 00:22:16

标签: r dataframe

我有一个数据框:

# Example input: three gene columns; some values occur in more than one column.
genes_1 <- c("a", "b", "c", "d", "e")
genes_2 <- c("f", "g", "c", "e", "j")
genes_3 <- c("a", "b", "m", "n", "o")
df <- data.frame(genes_1, genes_2, genes_3)

我想要的输出:

# Desired result: every value that appears more than once anywhere in the
# table is blanked out; the remaining cells keep their position.
genes_1 <- c("", "", "", "d", "")
genes_2 <- c("f", "g", "", "", "j")
genes_3 <- c("", "", "m", "n", "o")
df <- data.frame(genes_1, genes_2, genes_3)

我该如何实现? 谢谢

3 个答案:

答案 0 :(得分:4)

零依赖的 base R 解决方案:

# Sample data, kept as character columns so cells can be overwritten with "".
xdf <- data.frame(
  genes_1 = c("a", "b", "c", "d", "e"),
  genes_2 = c("f", "g", "c", "e", "j"),
  genes_3 = c("a", "b", "m", "n", "o"),
  stringsAsFactors = FALSE
)

# Pool every cell into one vector and count how often each value occurs;
# the values counted more than once are the ones to blank out.
value_counts <- table(unlist(xdf, use.names = FALSE))
dups <- names(value_counts)[as.vector(value_counts > 1)]

# Column by column, overwrite the repeated values with "". Assigning into
# xdf[] keeps the data frame structure and column names.
xdf[] <- lapply(xdf, function(column) replace(column, column %in% dups, ""))

xdf

unlist()将所有列递归展开为单个字符向量。

table()计算每个元素的所有出现次数。

which()只保留结果为TRUE的元素(即出现次数大于1的值)。

names()取出这些元素的名称,得到由重复值组成的字符向量。

然后逐列处理,把每一列中出现在该向量里的值全部替换为""。

library(microbenchmark)
library(data.table)
# Needed for %>%, the dplyr/tidyr verbs used below, and ggplot2's autoplot().
library(tidyverse)

# Time the four approaches on the same input `xdf` (a data frame of character
# columns). Each branch starts from a fresh copy and derives the repeated
# values itself, so no branch depends on objects left in the workspace and all
# branches are timed for the same amount of work.
timings <- microbenchmark(
  # Count values with table(), then blank the repeated ones column by column.
  base = {
    ydf <- xdf
    dups <- names(which(table(unlist(ydf, use.names = FALSE)) > 1))
    ydf[] <- lapply(ydf, function(x) replace(x, x %in% dups, ""))
  },
  # Flag repeats with duplicated() in both directions, then assign through a
  # two-column (row, column) index matrix.
  base.2 = {
    ydf <- xdf
    tmp <- unlist(ydf)
    repeated <- duplicated(tmp) | duplicated(tmp, fromLast = TRUE)
    ydf[arrayInd(which(repeated), dim(ydf))] <- ""
  },
  # Reshape to long form, count each value, blank the repeated ones, and
  # reshape back; ID remembers each cell's original row.
  tidyverse = {
    ydf <- xdf %>%
      mutate(ID = row_number()) %>%
      pivot_longer(-ID, names_to = "genes", values_to = "value") %>%
      add_count(value) %>%
      mutate(value = if_else(n > 1, "", as.character(value))) %>%
      select(-n) %>%
      pivot_wider(names_from = genes, values_from = value) %>%
      select(-ID)
  },
  # Same idea as `base`, applied to every column through .SD.
  data.table = {
    ydt <- data.table(xdf)
    dups <- names(which(table(unlist(ydt, use.names = FALSE)) > 1))
    ydt[, lapply(.SD, function(x) replace(x, x %in% dups, ""))]
  }
)

# Show the timing table, then plot the distributions.
print(timings)
autoplot(timings)

enter image description here

答案 1 :(得分:3)

另一个 base R 解决方案:

# Flatten every cell of df into one long vector.
tmp <- unlist(df)
# A value is repeated if an equal value occurs earlier or later in the vector.
repeated <- duplicated(tmp) | duplicated(tmp, fromLast = TRUE)
# Turn the flat positions into (row, column) pairs and set those cells to NA.
cells <- arrayInd(which(repeated), dim(df))
df[cells] <- NA

#  genes_1 genes_2 genes_3
#1    <NA>       f    <NA>
#2    <NA>       g    <NA>
#3    <NA>    <NA>       m
#4       d    <NA>       n
#5    <NA>       j       o

unlist只是把df中的所有值合并成一个长向量。arrayInd随后生成一个两列的行/列索引矩阵,用来定位df中那些被duplicated标记为重复的值所在的单元格。

答案 2 :(得分:2)

这是一个tidyverse解决方案。 df2是最终输出。

library(tidyverse)

# Blank out every value that occurs more than once anywhere in `df`.
# `df2` is the final output: a tibble with the same columns as `df`.
df2 <- df %>%
  # Tag each row first so the wide layout can be rebuilt after reshaping.
  mutate(ID = row_number()) %>%
  # One row per cell: ID (original row), genes (column name), value.
  pivot_longer(-ID, names_to = "genes", values_to = "value") %>%
  # n = number of cells in the whole table holding this value.
  add_count(value) %>%
  # as.character() keeps this working when the columns of df are factors.
  mutate(value = if_else(n > 1, "", as.character(value))) %>%
  select(-n) %>%
  # Back to one column per original column, one row per original row.
  pivot_wider(names_from = genes, values_from = value) %>%
  select(-ID)