我有一个数据框，想找出每个值同时出现在哪些列中，并把这些列名与对应的值一起列出。
Project
输出
202055_at s100.Probeset,s101.Probeset,s102.Probeset,s103.Probeset,s10.Probeset
219957_at s102.Probeset,s103.Probeset,s10.Probeset
203063_at s100.Probeset,s10.Probeset
答案 0 :(得分:0)
我们可以先从数据集中取出所有唯一（unique）的元素：
# Distinct values found anywhere in df1, as character strings, in order of
# first appearance. Element names are not needed, so they are not built.
un1 <- as.character(unique(unlist(df1, use.names = FALSE)))
然后检查每个元素是否出现在多于一列中；如果是，就用 paste 把该元素与包含它的列名拼接起来：
# For every distinct value in un1, list the columns of df1 that contain it.
# Values present in only one column yield NULL, which unlist() drops, so
# `out` holds one "<value> <col>.<col>..." string per shared value.
out <- unlist(lapply(un1, function(x) {
  # Matches per column; na.rm = TRUE keeps a missing cell from turning the
  # whole column count into NA.
  x1 <- colSums(df1 == x, na.rm = TRUE)
  # Count the columns with at least one hit rather than the total number of
  # hits, so a value repeated inside a single column is not reported as shared.
  if (sum(x1 > 0) > 1) {
    paste(x, paste(names(x1)[x1 > 0], collapse = "."))
  }
}))
out[1:2]
#[1] "202055_at s100.Probeset.s101.Probeset.s102.Probeset.s10.Probeset"
#[2] "203063_at s100.Probeset.s10.Probeset"
答案 1 :(得分:0)
这里只是对 @akrun 的答案做一点补充。
我不确定你最终想做什么，不过用一个二元（逻辑）矩阵来表示每个探针组出现在哪些列中，可能便于后续处理。
# Your sample data
# Five probe IDs per sample column, each stored as a factor with explicitly
# listed levels so the result does not depend on locale-specific sorting.
probes <- data.frame(
  s100.Probeset = factor(
    c("202055_at", "203063_at", "210734_x_at", "206284_x_at", "221915_s_at"),
    levels = c("202055_at", "203063_at", "206284_x_at", "210734_x_at",
               "221915_s_at")
  ),
  # Note: the level "203248_at " carries a trailing space in the sample data.
  s101.Probeset = factor(
    c("203248_at ", "210734_x_at", "206284_x_at", "202055_at", "212522_at"),
    levels = c("202055_at", "203248_at ", "206284_x_at", "210734_x_at",
               "212522_at")
  ),
  s102.Probeset = factor(
    c("202055_at", "210734_x_at", "219957_at", "220661_s_at", "205453_at"),
    levels = c("202055_at", "205453_at", "210734_x_at", "219957_at",
               "220661_s_at")
  ),
  s10.Probeset = factor(
    c("219957_at", "202055_at", "203063_at", "211503_s_at", "214689_at"),
    levels = c("202055_at", "203063_at", "211503_s_at", "214689_at",
               "219957_at")
  )
)
# Unique probesets
# Every distinct probe ID across all columns of `probes`, in order of first
# appearance.
unique.probes <- unique(unlist(probes))
# Binary matrix
# One row per distinct probe ID and one logical column per column of `probes`:
# TRUE when that column contains the ID. Iterating over the columns with
# lapply() avoids the as.matrix() coercion that apply() performs on a data
# frame, and still yields one column per sample when only a single distinct
# ID exists.
df1 <- data.frame(
  lapply(probes, function(x) unique.probes %in% x),
  row.names = unique.probes
)
df1
# s100.Probeset s101.Probeset s102.Probeset s10.Probeset
#202055_at TRUE TRUE TRUE TRUE
#203063_at TRUE FALSE FALSE TRUE
#210734_x_at TRUE TRUE TRUE FALSE
#206284_x_at TRUE TRUE FALSE FALSE
#221915_s_at TRUE FALSE FALSE FALSE
#203248_at FALSE TRUE FALSE FALSE
#212522_at FALSE TRUE FALSE FALSE
#219957_at FALSE FALSE TRUE TRUE
#220661_s_at FALSE FALSE TRUE FALSE
#205453_at FALSE FALSE TRUE FALSE
#211503_s_at FALSE FALSE FALSE TRUE
#214689_at FALSE FALSE FALSE TRUE
然后，您可以由此回到与 @akrun 的解决方案相同的结果：
# IDs per probeset
# For each row of df1 (one probe ID), join the names of the columns whose
# cell is TRUE into a single comma-separated string.
df2 <- data.frame(
  probeset = rownames(df1),
  ID = vapply(
    # seq_len() gives an empty sequence for a zero-row df1, where 1:nrow(df1)
    # would iterate over c(1, 0).
    seq_len(nrow(df1)),
    function(i) paste(colnames(df1)[which(df1[i, ] == 1)], collapse = ","),
    # Guarantees exactly one string per row, whatever the input size.
    character(1)
  ),
  # Same column-name handling as cbind() on data frames.
  check.names = FALSE
)
df2
# probeset ID
#1 202055_at s100.Probeset,s101.Probeset,s102.Probeset,s10.Probeset
#2 203063_at s100.Probeset,s10.Probeset
#3 210734_x_at s100.Probeset,s101.Probeset,s102.Probeset
#4 206284_x_at s100.Probeset,s101.Probeset
#5 221915_s_at s100.Probeset
#6 203248_at s101.Probeset
#7 212522_at s101.Probeset
#8 219957_at s102.Probeset,s10.Probeset
#9 220661_s_at s102.Probeset
#10 205453_at s102.Probeset
#11 211503_s_at s10.Probeset
#12 214689_at s10.Probeset