我有两个数据集df1和df2。如何从df1的Genes列中删除在df2中出现的基因名称?
df1 <-
chr start end CNA Genes No.of.Gene
1 13991 1401 gain Cfh,Gm26048,Bhis,Sclm 2
1 14011 1490 gain Zfp788,Rik 2
df2 <-
Genes
Gm26048
Gif
Tl2
Rik
预期产出
chr start end CNA Genes No.of.Gene
1 13991 1401 gain Cfh,Bhis,Sclm 2
1 14011 1490 gain Zfp788 2
答案 0 :(得分:3)
你可以使用:
# Split each row's comma-separated Genes string, drop every gene that is
# listed in df2$Genes, and join what is left back into one string per row.
# vapply() + paste() always returns a character vector with one element per
# row. A bare sapply(..., setdiff) is type-unstable here: it returns a ragged
# list when rows keep different numbers of genes and a matrix when they all
# keep the same number (> 1), either of which corrupts the column.
df1$Genes <- vapply(
  strsplit(as.character(df1$Genes), ",", fixed = TRUE),
  function(genes) {
    paste(setdiff(genes, as.character(df2$Genes)), collapse = ",")
  },
  character(1)
)
df1
# chr start end CNA Genes No.of.Gene
#1 1 13991 1401 gain Cfh 2
#2 1 14011 1490 gain Zfp788 2
编辑
在df1更改之后,可以这样获得预期结果:
# For each row: split Genes on commas, remove the genes found in df2$Genes,
# and collapse the survivors back into a single comma-separated string.
# A single vapply() replaces the nested sapply() calls: if the inner sapply()
# simplified its result to a matrix (every row keeping the same number of
# genes), the outer sapply() would iterate over individual cells and return
# one element per gene instead of one element per row.
vapply(
  strsplit(as.character(df1$Genes), ",", fixed = TRUE),
  function(genes) {
    paste(setdiff(genes, as.character(df2$Genes)), collapse = ",")
  },
  character(1)
)
#[1] "Cfh,Bhis,Sclm" "Zfp788"
答案 1 :(得分:3)
另一种选择是使用gsub:
# Build one alternation of all gene names in df2. The lookarounds
# (?<![^,]) and (?![^,]) require the match to be a whole comma-delimited
# token, so a short name such as "Rik" is not removed from inside a longer
# gene name that merely contains it.
# NOTE: gene names are inserted into the regex unescaped; names containing
# regex metacharacters would need escaping first.
drop_pattern <- paste0(
  "(?<![^,])(?:", paste(df2$Genes, collapse = "|"), ")(?![^,])"
)
df1$Genes <- gsub(drop_pattern, "", df1$Genes, perl = TRUE)
# Tidy the separators left behind: leading commas (first gene removed),
# trailing commas (last gene removed) and doubled commas (middle gene removed).
df1$Genes <- gsub("^,+|,+$|,(?=,)", "", df1$Genes, perl = TRUE)
df1$Genes
#[1] "Cfh,Bhis,Sclm" "Zfp788"
答案 2 :(得分:3)
我们可以将Genes列转换为行,然后使用filter:
# Example data
# df1: table whose Genes column holds a comma-separated list of gene names
# per row; the column names are taken from the first line of the text.
df1 <- read.table(text = "
chr start end CNA Genes No.of.Gene
1 13991 1401 gain Cfh,Gm26048,Bhis,Sclm 2
1 14011 1490 gain Zfp788,Rik 2", header = TRUE)
# df2: single-column table (Genes) listing the gene names to look up in df1.
df2 <- read.table(text = "
Genes
Gm26048
Gif
Tl2
Rik", header = TRUE)
library(dplyr)
library(tidyr)
# filter matching genes - intersect
# Split each Genes string on commas into a list column (Gene), expand it to
# one row per gene with unnest(), then keep only the rows whose gene is
# listed in df2$Genes. The original Genes column is left untouched.
df1 %>%
mutate(Gene = strsplit(as.character(Genes), ",")) %>%
unnest(Gene) %>%
filter(Gene %in% df2$Genes)
# Printed result:
# chr start end CNA Genes No.of.Gene Gene
# (int) (int) (int) (fctr) (fctr) (int) (chr)
# 1 1 13991 1401 gain Cfh,Gm26048,Bhis,Sclm 2 Gm26048
# 2 1 14011 1490 gain Zfp788,Rik 2 Rik
# filter non-matching genes - setdiff
# Same split-and-unnest step as above, but the filter is negated so only the
# genes that are NOT listed in df2$Genes remain (one row per remaining gene).
df1 %>%
mutate(Gene = strsplit(as.character(Genes), ",")) %>%
unnest(Gene) %>%
filter(!Gene %in% df2$Genes)
# Printed result:
# chr start end CNA Genes No.of.Gene Gene
# (int) (int) (int) (fctr) (fctr) (int) (chr)
# 1 1 13991 1401 gain Cfh,Gm26048,Bhis,Sclm 2 Cfh
# 2 1 13991 1401 gain Cfh,Gm26048,Bhis,Sclm 2 Bhis
# 3 1 13991 1401 gain Cfh,Gm26048,Bhis,Sclm 2 Sclm
# 4 1 14011 1490 gain Zfp788,Rik 2 Zfp788
答案 3 :(得分:0)
第一个模式(pattn1)负责删除df2中列出的基因,而第二个模式(pattn2)则删除字符串开头或结尾多余的逗号:
# str_replace_all() comes from stringr, which this snippet did not load.
library(stringr)
# pattn1 matches any gene listed in df2 together with the comma that follows
# it, if there is one. The lookarounds (?<![^,]) and (?![^,]) restrict the
# match to a whole comma-delimited token, so a short name such as "Rik" is
# not stripped out of a longer gene name that merely contains it.
# NOTE: gene names are inserted into the regex unescaped; names containing
# regex metacharacters would need escaping first.
pattn1 <- paste0(
  "(?<![^,])(?:", paste(df2$Genes, collapse = "|"), ")(?![^,]),?"
)
df1$Genes <- str_replace_all(df1$Genes, pattn1, "")
# pattn2 removes a comma left dangling at the start or end of the string
# (e.g. after the last gene in a row has been removed).
pattn2 <- "^,|,$"
df1$Genes <- str_replace_all(df1$Genes, pattn2, "")
## Results ##
chr start end CNA Genes No.of.Gene
1 1 13991 1401 gain Cfh,Bhis,Sclm 2
2 1 14011 1490 gain Zfp788 2