通过比较两个数据框来过滤共有的字符串

时间:2016-06-14 08:40:17

标签: r dataframe data.table bioinformatics

我有两个数据集df1和df2。如何从df1的Genes列中删除df2中列出的基因名称?

df1 <-

chr   start   end     CNA       Genes                  No.of.Gene
   1    13991   1401    gain    Cfh,Gm26048,Bhis,Sclm       2
   1    14011   1490    gain    Zfp788,Rik                  2

df2 <-

       Genes
      Gm26048
        Gif
        Tl2
        Rik

预期输出

chr   start   end     CNA       Genes            No.of.Gene
   1    13991   1401    gain    Cfh,Bhis,Sclm        2
   1    14011   1490    gain    Zfp788               2

4 个答案:

答案 0 :(得分:3)

你可以使用,

# Drop every gene listed in df2 from the comma-separated Genes column of df1.
# Each row is split on ",", filtered with setdiff(), and joined back with ",".
# vapply() with character(1) guarantees an atomic character column; sapply()
# returns a ragged list as soon as rows keep different numbers of genes.
# Note: setdiff() also removes duplicate gene names within a row, and a row
# whose genes are all listed in df2 becomes the empty string "".
df1$Genes <- vapply(
  strsplit(as.character(df1$Genes), ",", fixed = TRUE),
  function(genes) {
    # strsplit() yields a single NA for a missing row; keep it missing
    # instead of pasting it into the string "NA".
    if (anyNA(genes)) {
      return(NA_character_)
    }
    paste(setdiff(genes, df2$Genes), collapse = ",")
  },
  character(1)
)

df1
#  chr start  end  CNA  Genes No.of.Gene
#1   1 13991 1401 gain    Cfh          2
#2   1 14011 1490 gain Zfp788          2

编辑

问题中的df1更改后,可用以下代码获得预期结果:

# For each row of df1, split Genes on ",", drop the genes listed in df2, and
# join what is left back into one comma-separated string.
# This is done in a single vapply() pass: nesting two sapply() calls makes the
# outer call operate on the *simplified* result of the inner one, which is a
# matrix (pasted cell by cell) when every row keeps the same number of genes,
# and a named vector when every row keeps exactly one gene.
vapply(
  strsplit(as.character(df1$Genes), ",", fixed = TRUE),
  function(genes) paste(setdiff(genes, df2$Genes), collapse = ","),
  character(1)
)
#[1] "Cfh,Bhis,Sclm" "Zfp788"

答案 1 :(得分:3)

另一种选择是使用gsub

# Remove the df2 genes from the comma-separated Genes column with one regex.
# The pattern only matches a complete entry: it must begin at the start of the
# string or right after a comma, and be followed by a comma or the end of the
# string. Without these anchors "Rik" would also be cut out of longer names
# that merely contain it. \Q...\E quotes each gene name so any regex
# metacharacter in a name is matched literally.
df1$Genes <- gsub(
  paste0(
    "(^|,)(",
    paste0("\\Q", df2$Genes, "\\E", collapse = "|"),
    ")(?=,|$)"
  ),
  "",
  as.character(df1$Genes),
  perl = TRUE
)
# When the first entry of a row is removed its separator stays behind as a
# leading comma; strip it.
df1$Genes <- sub("^,", "", df1$Genes)
df1$Genes
#[1] "Cfh,Bhis,Sclm" "Zfp788"  

答案 2 :(得分:3)

我们可以将Genes列转换为行,然后使用filter:

#data
# Example inputs, parsed from whitespace-separated text with a header row.
# df1: one row per region; Genes holds a comma-separated list of gene names.
df1 <- utils::read.table(
  header = TRUE,
  text = "
chr   start   end     CNA       Genes                  No.of.Gene
1    13991   1401    gain    Cfh,Gm26048,Bhis,Sclm       2
1    14011   1490    gain    Zfp788,Rik                  2"
)
# df2: single Genes column listing the gene names to look up in df1.
df2 <- utils::read.table(
  header = TRUE,
  text = "
Genes
Gm26048
Gif
Tl2
Rik"
)

library(dplyr)
library(tidyr)

# Intersect: expand df1 to one gene per row (new column Gene, original Genes
# column kept) and keep only the rows whose gene is listed in df2.
df1 %>%
  mutate(Gene = strsplit(as.character(Genes), split = ",", fixed = TRUE)) %>%
  unnest(Gene) %>%
  filter(is.element(Gene, df2$Genes))

#     chr start   end    CNA                 Genes No.of.Gene    Gene
#   (int) (int) (int) (fctr)                (fctr)      (int)   (chr)
# 1     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2 Gm26048
# 2     1 14011  1490   gain            Zfp788,Rik          2     Rik

# Setdiff: expand df1 to one gene per row (new column Gene, original Genes
# column kept) and keep only the rows whose gene is NOT listed in df2.
df1 %>%
  mutate(Gene = strsplit(as.character(Genes), split = ",", fixed = TRUE)) %>%
  unnest(Gene) %>%
  filter(!is.element(Gene, df2$Genes))

#     chr start   end    CNA                 Genes No.of.Gene   Gene
#    (int) (int) (int) (fctr)                (fctr)      (int)  (chr)
# 1     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2    Cfh
# 2     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2   Bhis
# 3     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2   Sclm
# 4     1 14011  1490   gain            Zfp788,Rik          2 Zfp788

答案 3 :(得分:0)

模式1(pattn1)负责删除df2中列出的基因,而pattn2则删除字符串开头或结尾残留的逗号:

# str_replace_all() comes from stringr, which this snippet must load itself.
library(stringr)

# pattn1 matches a df2 gene only when it is a complete comma-delimited entry:
# it has to start at the beginning of the string or right after a comma, and
# be followed by a comma or the end of the string. The separator in front of
# the entry is removed together with it, so deleting a gene from the middle of
# a list cannot leave ",," behind, and a short name such as "Rik" is not cut
# out of a longer gene name that contains it. \Q...\E quotes each name so that
# regex metacharacters in gene names are matched literally.
pattn1 <- paste0(
  "(^|,)(?:",
  paste0("\\Q", df2$Genes, "\\E", collapse = "|"),
  ")(?=,|$)"
)
df1$Genes <- str_replace_all(as.character(df1$Genes), pattn1, "")
# pattn2 removes a comma left over at either end of the string (e.g. after
# the first entry of a row was removed).
pattn2 <- "^,|,$"
df1$Genes <- str_replace_all(df1$Genes, pattn2, "")

## Results ##

  chr start  end  CNA         Genes No.of.Gene
1   1 13991 1401 gain Cfh,Bhis,Sclm          2
2   1 14011 1490 gain        Zfp788          2