我有一个包含三列的数据,如
Inputdf<-structure(list(df1 = structure(c(4L, 5L, 2L, 1L, 3L), .Label = c("P61160,P61158,O15143,O15144,O15145,P59998,O15511",
"P78537,Q6QNY1,Q6QNY0", "Q06323,Q9UL46", "Q92793,Q09472,Q9Y6Q9,Q92831",
"Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"), class = "factor"),
df2 = structure(c(3L, 2L, 5L, 4L, 1L), .Label = c("", "P61158,O15143,O15144",
"Q06323,Q9UL46", "Q6QNY0", "Q92828"), class = "factor"),
df3 = structure(c(5L, 4L, 3L, 2L, 1L), .Label = c("", "O15511",
"Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -5L))
我试图在此数据中找到类似的字符串,例如
在df1
中,我有第一行Q92793,Q09472,Q9Y6Q9,Q92831
然后我看看df2和df3,看看是否有任何这些成员在那里然后在这个例子中,我做了以下数据
df1 df2 df3 Numberdf1 df2 df3
1 0 1 4 0 Q92793,Q09472
df1 1表示df1的第一行
df2 0表示它没有任何相似性
df3 1表示df3的第一行与df1第1行具有相似性
Numberdf1,它是由,
分隔的字符串数,为4
df2为0,因为没有任何类似的字符串协议df2
df3是Q92793,Q09472,它粘贴了这里类似的字符串
欲望输出如下所示
out<- structure(list(df1 = 1:5, df2 = c(0L, 3L, 4L, 2L, 1L), df3 = c(1L,
0L, 2L, 4L, 3L), Numberdf1 = c(4L, 6L, 2L, 7L, 2L), df2.1 = structure(c(1L,
5L, 4L, 2L, 3L), .Label = c("0", "P61158,O15143,O15144", "Q06323,Q9UL46",
"Q6QNY0", "Q92828"), class = "factor"), df3.1 = structure(c(5L,
1L, 4L, 2L, 3L), .Label = c("0", "O15511", "Q06323,Q9UL46", "Q6QNY0",
"Q92793,Q09472"), class = "factor")), .Names = c("df1", "df2",
"df3", "Numberdf1", "df2.1", "df3.1"), class = "data.frame", row.names = c(NA,
-5L))
以下功能不起作用,例如,将此数据用作输入
Inputdf1<- structure(list(df1 = structure(c(2L, 3L, 1L), .Label = c("Q06323,Q9UL46",
"Q92793,Q09472,Q9Y6Q9,Q92831", "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
), class = "factor"), df2 = structure(1:3, .Label = c("P25788,P25789",
"Q92828, O60907, O75376", "Q9UL46, Q06323"), class = "factor"),
df3 = structure(c(2L, 1L, 3L), .Label = c("Q92831, Q92793, Q09472",
"Q9BZK7, Q92828, O75376, O60907", "Q9UL46, Q06323"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -3L))
答案 0 :(得分:1)
这适用于您的示例:
# First convert factors to strings to lists
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,','))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = Position(not.empty, df2.intersect, nomatch=0)
out[r, 'df3'] = Position(not.empty, df3.intersect, nomatch=0)
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(Find(not.empty, df2.intersect, nomatch=0), collapse=',')
out[r, 'df3.1'] = paste(Find(not.empty, df3.intersect, nomatch=0), collapse=',')
}
out
# df1 df2 df3 Numberdf1 df2.1 df3.1
# 1 1 0 1 4 0 Q92793,Q09472
# 2 2 3 0 6 Q92828 0
# 3 3 4 2 3 Q6QNY0 Q6QNY0
# 4 4 2 4 7 P61158,O15143,O15144 O15511
# 5 5 1 3 2 Q06323,Q9UL46 Q06323,Q9UL46
注意:Find
和Position
仅识别第一个匹配。如果可能存在多个匹配项,请使用which
。
修改强>
多个匹配的版本会计
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,',\\s*'))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = paste(which(sapply(df2.intersect, not.empty)), collapse=',')
out[r, 'df3'] = paste(which(sapply(df3.intersect, not.empty)), collapse=',')
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(unique(unlist(df2.intersect)), collapse=',')
out[r, 'df3.1'] = paste(unique(unlist(df3.intersect)), collapse=',')
}
out[out==""] = "0"