在两列中找到相似的字符串

时间:2016-12-11 13:39:32

标签: r string

我想统计第一列(string1)每一行中,有多少个以逗号分隔的字符串也出现在string2的对应行中。

# Question input: a one-column data frame whose `string1` factor holds
# comma-separated tokens (7 rows; the last four rows repeat the same value).
df1 <- data.frame(
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", rep("gdyijq,udyhfs,gqdtr", 4L)),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  )
)

# Question input: a one-column data frame whose `string2` factor holds the
# tokens to compare against (4 rows; "0" is one of the cell values).
df2 <- data.frame(
  string2 = factor(
    c("hishsgd,gugddf", "0", "ydis", "gqdtr"),
    levels = c("0", "gqdtr", "hishsgd,gugddf", "ydis")
  )
)

我尝试将这两个绑定为dfM而没有成功

# NOTE: this is the failing attempt. cbind() on data frames needs matching row
# counts, but df1 has 7 rows and df2 has only 4, so the call stops with an error.
dfM <- cbind(df1,df2)

df1看起来像

              string1
1      hishsgd,gugddf
2               hdydg
3                ydis
4 gdyijq,udyhfs,gqdtr
5 gdyijq,udyhfs,gqdtr
6 gdyijq,udyhfs,gqdtr
7 gdyijq,udyhfs,gqdtr

和df2,看起来像

        string2
1 hishsgd,gugddf
2              0
3           ydis
4          gqdtr

我希望像这样

# Desired result of binding the two inputs: `string2` is padded with empty
# strings so that it has the same 7 rows as `string1`.
dfN <- data.frame(
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", rep("gdyijq,udyhfs,gqdtr", 4L)),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  ),
  string2 = factor(
    c("hishsgd,gugddf", "0", "ydis", "gqdtr", rep("", 3L)),
    levels = c("", "0", "gqdtr", "hishsgd,gugddf", "ydis")
  )
)


################## second part ###############

第二部分是

 # Input for the second part: four rows pairing each `string1` cell with the
 # `string2` cell it should be compared against.
 dfN <- data.frame(
   string1 = factor(
     c("hishsgd,gugddf", "hdydg", "ydis", "gdyijq,udyhfs,gqdtr"),
     levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
   ),
   string2 = factor(
     c("hishsgd,gugddf", "0", "ydis", "gqdtr"),
     levels = c("0", "gqdtr", "hishsgd,gugddf", "ydis")
   )
 )

例如在第一行

string1            string2
hishsgd,gugddf    hishsgd,gugddf

所以它应该是2

在第二行

string1            string2
hdydg                 0

它们不相似,因此应为0。

依此类推,预期输出如下

# Expected output: the two input columns plus `similar`, the number of
# comma-separated tokens of `string1` that also occur in `string2` on that row.
renew <- data.frame(
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", "gdyijq,udyhfs,gqdtr"),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  ),
  string2 = factor(
    c("hishsgd,gugddf", "0", "ydis", "gqdtr"),
    levels = c("0", "gqdtr", "hishsgd,gugddf", "ydis")
  ),
  similar = c(2L, 0L, 1L, 1L)
)

2 个答案:

答案 0 :(得分:2)

我们可以使用strsplit拆分每列中的字符串,使用Map对两列中对应的list元素逐对调用intersect以获取公共元素,然后用lengths求出每个结果的length

# Tokenise every column of dfN: each cell becomes the character vector of its
# comma-separated pieces, giving one list of vectors per column.
lst <- lapply(dfN, function(column) {
  strsplit(as.character(column), ",")
})
# Pair up the rows of the first two columns, keep the tokens they share, and
# append the size of each intersection as the `similar` column.
renew1 <- cbind(
  dfN,
  similar = lengths(mapply(intersect, lst[[1]], lst[[2]], SIMPLIFY = FALSE))
)
# Compare with the expected output.
identical(renew, renew1)
#[1] TRUE

答案 1 :(得分:1)

或者您可以使用%in%进行匹配

# Sample input from the question: two factor columns whose cells hold
# comma-separated tokens.
dfN <- structure(list(string1 = structure(c(3L, 2L, 4L, 1L), .Label = c("gdyijq,udyhfs,gqdtr", 
    "hdydg", "hishsgd,gugddf", "ydis"), class = "factor"), string2 = structure(c(3L, 
    1L, 4L, 2L), .Label = c("0", "gqdtr", "hishsgd,gugddf", "ydis"
    ), class = "factor")), .Names = c("string1", "string2"), class = "data.frame", row.names = c(NA, 
    -4L))
# Expected output: the same columns plus the per-row match count `similar`.
renew <- structure(list(string1 = structure(c(3L, 2L, 4L, 1L), .Label = c("gdyijq,udyhfs,gqdtr", 
"hdydg", "hishsgd,gugddf", "ydis"), class = "factor"), string2 = structure(c(3L, 
1L, 4L, 2L), .Label = c("0", "gqdtr", "hishsgd,gugddf", "ydis"
), class = "factor"), similar = c(2L, 0L, 1L, 1L)), .Names = c("string1", 
"string2", "similar"), class = "data.frame", row.names = c(NA, 
-4L))

# Print both data frames for inspection.
dfN
renew

# Split every cell into its comma-separated tokens
# (one character vector per row).
col1 <- strsplit(as.character(dfN$string1), ",")
col2 <- strsplit(as.character(dfN$string2), ",")

# For each row, flag which string1 tokens also occur in that row's string2.
# Map() always returns a list. mapply() with its default SIMPLIFY = TRUE would
# collapse the result to a vector or matrix whenever every row has the same
# number of tokens, and the per-row sums below would then be wrong.
res <- Map(`%in%`, col1, col2)

# Count the TRUE values per row. vapply() guarantees exactly one integer per
# row, including for zero-row input (where unlist(lapply(...)) would be NULL).
res2 <- vapply(res, sum, integer(1))

# Attach the counts as a new column.
resultDF <- data.frame(dfN, newcol = res2)

# Check against the expected output. `==` on two equally-sized data frames
# compares them column by column by position, so the differing column names
# (`newcol` vs `similar`) do not matter here.
resultDF == renew