合并和计算类似的字符串

时间:2016-12-11 04:36:58

标签: r string

我有一个包含三列的数据框,如下所示:

# Example input: a 5-row data frame with three factor columns. Each cell is
# a comma-separated list of identifiers; an empty string marks a row with
# no identifiers. Levels are spelled out explicitly so the factor coding does
# not depend on the locale's sort order.
Inputdf <- data.frame(
  df1 = factor(
    c(
      "Q92793,Q09472,Q9Y6Q9,Q92831",
      "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7",
      "P78537,Q6QNY1,Q6QNY0",
      "P61160,P61158,O15143,O15144,O15145,P59998,O15511",
      "Q06323,Q9UL46"
    ),
    levels = c(
      "P61160,P61158,O15143,O15144,O15145,P59998,O15511",
      "P78537,Q6QNY1,Q6QNY0",
      "Q06323,Q9UL46",
      "Q92793,Q09472,Q9Y6Q9,Q92831",
      "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
    )
  ),
  df2 = factor(
    c("Q06323,Q9UL46", "P61158,O15143,O15144", "Q92828", "Q6QNY0", ""),
    levels = c("", "P61158,O15143,O15144", "Q06323,Q9UL46", "Q6QNY0", "Q92828")
  ),
  df3 = factor(
    c("Q92793,Q09472", "Q6QNY0", "Q06323,Q9UL46", "O15511", ""),
    levels = c("", "O15511", "Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472")
  )
)

我试图在此数据中找到类似的字符串,例如

例如,df1 的第一行是 Q92793,Q09472,Q9Y6Q9,Q92831。然后我查看 df2 和 df3,看其中是否包含这些成员中的任何一个。对于这个例子,我得到了以下结果:

df1 df2 df3 Numberdf1      df2     df3
1   0   1   4              0      Q92793,Q09472

其中:第一个 df1 列为 1,表示 df1 的第一行;第一个 df2 列为 0,表示 df2 中没有任何相似项;第一个 df3 列为 1,表示 df3 的第一行与 df1 的第 1 行有相似项;Numberdf1 是以逗号分隔的字符串个数,这里为 4;第二个 df2 列为 0,因为 df2 中没有任何相似的字符串;第二个 df3 列为 Q92793,Q09472,即把相似的字符串粘贴在这里。

期望的输出如下所示:

# Desired result for the 5-row example: one row per df1 row, with the
# matching row index in df2/df3 (0 = no match), the number of identifiers in
# the df1 row, and the shared identifiers as factors ("0" = no match).
out <- data.frame(
  df1 = 1:5,
  df2 = c(0L, 3L, 4L, 2L, 1L),
  df3 = c(1L, 0L, 2L, 4L, 3L),
  Numberdf1 = c(4L, 6L, 2L, 7L, 2L),
  df2.1 = factor(
    c("0", "Q92828", "Q6QNY0", "P61158,O15143,O15144", "Q06323,Q9UL46"),
    levels = c("0", "P61158,O15143,O15144", "Q06323,Q9UL46", "Q6QNY0", "Q92828")
  ),
  df3.1 = factor(
    c("Q92793,Q09472", "0", "Q6QNY0", "O15511", "Q06323,Q9UL46"),
    levels = c("0", "O15511", "Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472")
  )
)

下面的函数在某些情况下不起作用,例如将以下数据用作输入时:

# Second example input: 3 rows, three factor columns. Unlike the first
# example, df2 and df3 separate their identifiers with ", " (comma + space).
# Levels are spelled out explicitly so the factor coding does not depend on
# the locale's sort order.
Inputdf1 <- data.frame(
  df1 = factor(
    c(
      "Q92793,Q09472,Q9Y6Q9,Q92831",
      "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7",
      "Q06323,Q9UL46"
    ),
    levels = c(
      "Q06323,Q9UL46",
      "Q92793,Q09472,Q9Y6Q9,Q92831",
      "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
    )
  ),
  df2 = factor(
    c("P25788,P25789", "Q92828, O60907, O75376", "Q9UL46, Q06323"),
    levels = c("P25788,P25789", "Q92828, O60907, O75376", "Q9UL46, Q06323")
  ),
  df3 = factor(
    c("Q9BZK7, Q92828, O75376, O60907", "Q92831, Q92793, Q09472", "Q9UL46, Q06323"),
    levels = c("Q92831, Q92793, Q09472", "Q9BZK7, Q92828, O75376, O60907", "Q9UL46, Q06323")
  )
)

1 个答案:

答案 0 :(得分:1)

这适用于您的示例:

# For every row of df1, find the FIRST row of df2 and of df3 that shares at
# least one identifier with it, and report that row's index and the shared
# identifiers (0 / "0" when no row matches).

# Convert each column (factor or character) into a list-column of identifier
# vectors. strsplit() always returns a list; wrapping it in sapply() would
# simplify the result to a matrix whenever every row happens to hold the same
# number of identifiers, which breaks the assignment back into the data frame.
Inputdf[] <- lapply(Inputdf, function(col) strsplit(as.character(col), ","))

# TRUE when an intersection holds at least one identifier
not.empty <- function(x) length(x) > 0

# For each df1 row, the identifiers it shares with every row of `column`
# (one list per df1 row, holding one character vector per row of `column`).
shared.ids <- function(column) {
  lapply(Inputdf$df1, function(ids) lapply(column, intersect, ids))
}
df2.shared <- shared.ids(Inputdf$df2)
df3.shared <- shared.ids(Inputdf$df3)

# Row index of the first non-empty intersection; 0L when there is none, so
# the column is always integer regardless of which rows match.
first.row <- function(shared) {
  vapply(shared, function(s) Position(not.empty, s, nomatch = 0L), integer(1))
}

# Identifiers of the first non-empty intersection, "0" when there is none
first.ids <- function(shared) {
  vapply(
    shared,
    function(s) paste(Find(not.empty, s, nomatch = 0), collapse = ","),
    character(1)
  )
}

# Build the result in one step instead of growing it cell by cell in a loop.
# seq_len() also avoids the c(1, 0) sequence that 1:nrow() produces for a
# zero-row input.
out <- data.frame(
  df1 = seq_len(nrow(Inputdf)),
  df2 = first.row(df2.shared),
  df3 = first.row(df3.shared),
  Numberdf1 = vapply(Inputdf$df1, length, integer(1)),
  df2.1 = first.ids(df2.shared),
  df3.1 = first.ids(df3.shared),
  stringsAsFactors = FALSE
)

out
#   df1 df2 df3 Numberdf1                df2.1         df3.1
# 1   1   0   1         4                    0 Q92793,Q09472
# 2   2   3   0         6               Q92828             0
# 3   3   4   2         3               Q6QNY0        Q6QNY0
# 4   4   2   4         7 P61158,O15143,O15144        O15511
# 5   5   1   3         2        Q06323,Q9UL46 Q06323,Q9UL46

注意:Find 和 Position 仅识别第一个匹配项。如果可能存在多个匹配项,请使用 which。

修改

考虑多个匹配项的版本:

# For every row of df1, report ALL rows of df2 and of df3 that share at least
# one identifier with it (indices joined by ","), together with the distinct
# shared identifiers. Cells with no match are set to "0".

# Convert each column (factor or character) into a list-column of identifier
# vectors, splitting on a comma plus any following whitespace so "A, B" and
# "A,B" yield the same identifiers. strsplit() always returns a list; wrapping
# it in sapply() would simplify the result to a matrix whenever every row
# happens to hold the same number of identifiers.
Inputdf[] <- lapply(Inputdf, function(col) strsplit(as.character(col), ",\\s*"))

# TRUE when an intersection holds at least one identifier
not.empty <- function(x) length(x) > 0

# For each df1 row, the identifiers it shares with every row of `column`
# (one list per df1 row, holding one character vector per row of `column`).
shared.ids <- function(column) {
  lapply(Inputdf$df1, function(ids) lapply(column, intersect, ids))
}
df2.shared <- shared.ids(Inputdf$df2)
df3.shared <- shared.ids(Inputdf$df3)

# Indices of all rows with a non-empty intersection, e.g. "1,3"; "" if none
matching.rows <- function(shared) {
  vapply(
    shared,
    function(s) paste(which(vapply(s, not.empty, logical(1))), collapse = ","),
    character(1)
  )
}

# Distinct identifiers shared with any row, joined by ","; "" if none
matching.ids <- function(shared) {
  vapply(
    shared,
    function(s) paste(unique(unlist(s)), collapse = ","),
    character(1)
  )
}

# Build the result in one step instead of growing it cell by cell in a loop.
# seq_len() also avoids the c(1, 0) sequence that 1:nrow() produces for a
# zero-row input.
out <- data.frame(
  df1 = seq_len(nrow(Inputdf)),
  df2 = matching.rows(df2.shared),
  df3 = matching.rows(df3.shared),
  Numberdf1 = vapply(Inputdf$df1, length, integer(1)),
  df2.1 = matching.ids(df2.shared),
  df3.1 = matching.ids(df3.shared),
  stringsAsFactors = FALSE
)

# An empty string means "no match"; show it as "0"
out[out == ""] <- "0"