我想计算第一列(string1)每一行中,有多少个以逗号分隔的字符串也出现在string2的同一行中。
# Example input: one factor column whose cells hold comma-separated tokens.
# Levels are given explicitly so the integer codes match the dput() original.
df1 <- data.frame(
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", rep("gdyijq,udyhfs,gqdtr", 4L)),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  )
)
# Second example input: the strings to compare against, one per row.
# Levels are given explicitly so the integer codes match the dput() original.
df2 <- data.frame(
  string2 = factor(
    c("hishsgd,gugddf", "0", "ydis", "gqdtr"),
    levels = c("0", "gqdtr", "hishsgd,gugddf", "ydis")
  )
)
我尝试用cbind将这两个数据框按列绑定为dfM,
但没有成功(df1有7行,而df2只有4行)
# Attempt to column-bind the two data frames. This call errors: df1 has 7 rows
# and df2 has 4, and cbind() on data frames cannot recycle 4 rows to fill 7.
dfM <- cbind(df1,df2)
df1看起来像
string1
1 hishsgd,gugddf
2 hdydg
3 ydis
4 gdyijq,udyhfs,gqdtr
5 gdyijq,udyhfs,gqdtr
6 gdyijq,udyhfs,gqdtr
7 gdyijq,udyhfs,gqdtr
和df2,看起来像
string2
1 hishsgd,gugddf
2 0
3 ydis
4 gqdtr
我希望像这样
# Desired result of the bind: string2 is padded with empty strings so that it
# has as many rows as string1. Levels are explicit to match the dput() codes.
dfN <- data.frame(
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", rep("gdyijq,udyhfs,gqdtr", 4L)),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  ),
  string2 = factor(
    c("hishsgd,gugddf", "0", "ydis", "gqdtr", rep("", 3L)),
    levels = c("", "0", "gqdtr", "hishsgd,gugddf", "ydis")
  )
)
################## second part ###############
第二部分是
# Input for the second part: two factor columns of comma-separated tokens,
# compared row by row. Levels are explicit to match the dput() codes.
dfN <- data.frame(
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", "gdyijq,udyhfs,gqdtr"),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  ),
  string2 = factor(
    c("hishsgd,gugddf", "0", "ydis", "gqdtr"),
    levels = c("0", "gqdtr", "hishsgd,gugddf", "ydis")
  )
)
例如在第一行
string1 string2
hishsgd,gugddf hishsgd,gugddf
所以它应该是2
在第二行
string1 string2
hdydg 0
它们不相似,所以应为0,
依此类推。预期输出如下
# Expected output: the two input columns plus `similar`, the per-row count of
# tokens that the two columns have in common.
renew <- data.frame(
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", "gdyijq,udyhfs,gqdtr"),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  ),
  string2 = factor(
    c("hishsgd,gugddf", "0", "ydis", "gqdtr"),
    levels = c("0", "gqdtr", "hishsgd,gugddf", "ydis")
  ),
  similar = c(2L, 0L, 1L, 1L)
)
答案 0 :(得分:2)
我们可以使用strsplit
拆分每列中的字符串,得到list,
再使用Map
和intersect
获取每对list元素的公共元素,然后用lengths
求出每一行公共元素的个数(length)
# Split every column of dfN into a list holding one character vector of
# tokens per row.
lst <- lapply(dfN, function(column) {
  strsplit(as.character(column), ",")
})
# Pair the rows of the first and second column, keep the tokens they share,
# and store the number of shared tokens in a new `similar` column.
renew1 <- transform(
  dfN,
  similar = lengths(Map(function(a, b) intersect(a, b), lst[[1]], lst[[2]]))
)
# Confirm that the result reproduces the expected output exactly.
identical(renew, renew1)
#[1] TRUE
答案 1 :(得分:1)
或者您可以使用%in%
进行匹配
# Input: two factor columns whose cells hold comma-separated tokens.
dfN <- data.frame(
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", "gdyijq,udyhfs,gqdtr"),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  ),
  string2 = factor(
    c("hishsgd,gugddf", "0", "ydis", "gqdtr"),
    levels = c("0", "gqdtr", "hishsgd,gugddf", "ydis")
  )
)
# Expected output: the input plus the per-row match count in `similar`.
renew <- data.frame(
  string1 = factor(
    c("hishsgd,gugddf", "hdydg", "ydis", "gdyijq,udyhfs,gqdtr"),
    levels = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf", "ydis")
  ),
  string2 = factor(
    c("hishsgd,gugddf", "0", "ydis", "gqdtr"),
    levels = c("0", "gqdtr", "hishsgd,gugddf", "ydis")
  ),
  similar = c(2L, 0L, 1L, 1L)
)
dfN
renew

#' Count, row by row, how many tokens of `x` also occur in `y`.
#'
#' @param x,y Character vectors or factors of equal length; each element is a
#'   `sep`-separated list of tokens.
#' @param sep Literal separator between tokens.
#' @return An integer vector the same length as `x`. Element `i` is the number
#'   of tokens of `x[i]` found among the tokens of `y[i]` (a token repeated in
#'   `x[i]` is counted each time it appears).
count_similar <- function(x, y, sep = ",") {
  # use strsplit to break up the cell values
  x_tokens <- strsplit(as.character(x), sep, fixed = TRUE)
  y_tokens <- strsplit(as.character(y), sep, fixed = TRUE)
  stopifnot(length(x_tokens) == length(y_tokens))
  # use %in% to find matches and sum up the TRUE values per row.
  # vapply() always returns one integer per row; mapply() with its default
  # SIMPLIFY = TRUE would collapse equal-length rows into a matrix and break
  # the per-row sum.
  vapply(
    seq_along(x_tokens),
    function(i) sum(x_tokens[[i]] %in% y_tokens[[i]]),
    integer(1)
  )
}

# merge the result, using the column name of the expected output
resultDF <- data.frame(
  dfN,
  similar = count_similar(dfN$string1, dfN$string2)
)
# test: the result must reproduce the expected output exactly
identical(resultDF, renew)