我不得不承认这对我来说太难了。我必须分析一些数据,这一步对我来说至关重要。
我想分析的数据:
> dput(tbl_clustering)
structure(list(P1 = structure(c(14L, 14L, 6L, 6L, 6L, 19L, 15L,
13L, 13L, 13L, 13L, 10L, 10L, 6L, 6L, 10L, 27L, 27L, 27L, 27L,
27L, 22L, 22L, 22L, 21L, 21L, 21L, 27L, 27L, 27L, 27L, 21L, 21L,
21L, 28L, 28L, 25L, 25L, 25L, 29L, 29L, 17L, 17L, 17L, 5L, 5L,
5L, 5L, 20L, 20L, 23L, 23L, 23L, 23L, 7L, 26L, 26L, 24L, 24L,
24L, 24L, 3L, 3L, 3L, 9L, 8L, 2L, 11L, 11L, 11L, 11L, 11L, 12L,
12L, 4L, 4L, 4L, 1L, 1L, 1L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 16L, 16L, 16L, 16L, 16L, 16L, 16L), .Label = c("AT1G09130",
"AT1G09620", "AT1G10760", "AT1G14610", "AT1G43170", "AT1G58080",
"AT2G27680", "AT2G27710", "AT3G03710", "AT3G05590", "AT3G11510",
"AT3G56130", "AT3G58730", "AT3G61540", "AT4G03520", "AT4G22930",
"AT4G33030", "AT5G01600", "AT5G04710", "AT5G17990", "AT5G19220",
"AT5G43940", "AT5G63310", "ATCG00020", "ATCG00380", "ATCG00720",
"ATCG00770", "ATCG00810", "ATCG00900"), class = "factor"), P2 = structure(c(55L,
54L, 29L, 4L, 70L, 72L, 18L, 9L, 58L, 68L, 19L, 6L, 1L, 16L,
34L, 32L, 77L, 12L, 61L, 41L, 71L, 73L, 50L, 11L, 69L, 22L, 60L,
42L, 47L, 45L, 59L, 30L, 24L, 23L, 77L, 45L, 12L, 47L, 59L, 82L,
75L, 40L, 26L, 83L, 81L, 47L, 36L, 45L, 2L, 65L, 11L, 38L, 13L,
31L, 53L, 78L, 7L, 80L, 79L, 7L, 76L, 17L, 10L, 3L, 68L, 51L,
48L, 62L, 58L, 64L, 68L, 74L, 63L, 14L, 57L, 33L, 56L, 39L, 52L,
35L, 43L, 25L, 27L, 21L, 15L, 5L, 49L, 37L, 66L, 20L, 44L, 69L,
22L, 67L, 57L, 8L, 46L, 28L), .Label = c("AT1G01090", "AT1G02150",
"AT1G03870", "AT1G09795", "AT1G13060", "AT1G14320", "AT1G15820",
"AT1G17745", "AT1G20630", "AT1G29880", "AT1G29990", "AT1G43170",
"AT1G52340", "AT1G52670", "AT1G56450", "AT1G59900", "AT1G69830",
"AT1G75330", "AT1G78570", "AT2G05840", "AT2G28000", "AT2G34590",
"AT2G35040", "AT2G37020", "AT2G40300", "AT2G42910", "AT2G44050",
"AT2G44350", "AT2G45440", "AT3G01500", "AT3G03980", "AT3G04840",
"AT3G07770", "AT3G13235", "AT3G14415", "AT3G18740", "AT3G22110",
"AT3G22480", "AT3G22960", "AT3G51840", "AT3G54210", "AT3G54400",
"AT3G56090", "AT3G60820", "AT4G00100", "AT4G00570", "AT4G02770",
"AT4G11010", "AT4G14800", "AT4G18480", "AT4G20760", "AT4G26530",
"AT4G28750", "AT4G30910", "AT4G30920", "AT4G33760", "AT4G34200",
"AT5G02500", "AT5G02960", "AT5G10920", "AT5G12250", "AT5G13120",
"AT5G16390", "AT5G18380", "AT5G35360", "AT5G35590", "AT5G35630",
"AT5G35790", "AT5G48300", "AT5G52100", "AT5G56030", "AT5G60160",
"AT5G64300", "AT5G67360", "ATCG00160", "ATCG00270", "ATCG00380",
"ATCG00540", "ATCG00580", "ATCG00680", "ATCG00750", "ATCG00820",
"ATCG01110"), class = "factor"), No_Interactions = c(8L, 5L,
5L, 9L, 7L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 5L, 8L, 6L,
5L, 5L, 5L, 5L, 5L, 5L, 10L, 6L, 6L, 5L, 5L, 5L, 5L, 8L, 5L,
5L, 7L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 5L, 5L, 5L, 5L,
6L, 5L, 5L, 6L, 5L, 5L, 6L, 5L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 6L,
5L, 5L, 5L, 5L, 6L, 5L, 5L, 5L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 7L,
8L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 7L, 5L, 5L,
6L)), .Names = c("P1", "P2", "No_Interactions"), class = "data.frame", row.names = c(NA,
-98L))
为了更好地解释我想要实现的目标,我将在此处粘贴一些行:
P1 P2 No_Interactions
1 AT3G61540 AT4G30920 8
2 AT3G61540 AT4G30910 5
3 AT1G58080 AT2G45440 5
4 AT1G58080 AT1G09795 9
5 AT1G58080 AT5G52100 7
6 AT5G04710 AT5G60160 6
7 AT4G03520 AT1G75330 5
8 AT3G58730 AT1G20630 5
9 AT3G58730 AT5G02500 5
10 AT3G58730 AT5G35790 5
首先,必须创建新列Cluster
。接下来,我们只关注两列P1
和P2
。正如您在第一行中看到的,我们有两个名称AT3G61540
和AT4G30920
,这是我们的起点(我相信这是必要的循环)。我们将数字1放在Cluster
列中。我们取第一个名称AT3G61540
，并扫描P1
和P2
两列：如果这个名称在第一行之外的其他行中再次出现（与任何其他名称配对），我们同样在Cluster
列中填入数字1。接下来，我们取第一行的第二个名称AT4G30920
，并对整个数据做同样的筛选。
下一步是分析下一行，并做完全相同的事情。在这个例子中，下一行的P1
与上一行的名称完全相同，这意味着不需要再对它进行筛选；但第二个名称AT4G30910
是不同的，因此需要用它再筛选一遍。这里出现的问题是，这一行也应该属于cluster 1
。cluster 2
从第三行开始，因为那里出现了一对全新的名称。
我知道这不是一件容易的事情，可能需要分几步完成。
编辑：我想得到的输出（原帖中的输出表格在此处缺失）：
答案 0 :(得分:3)
您可以使用库igraph
创建一个无向图,您可以在其中对连接的组合进行聚类:
library(igraph)

# Build an undirected graph whose vertices are the ids found in P1/P2 and
# whose edges are the rows of tbl_clustering. Every connected component of
# that graph is one cluster.
# graph_from_data_frame() / components() replace the deprecated
# graph.data.frame() / clusters().
g <- graph_from_data_frame(tbl_clustering[, c("P1", "P2")], directed = FALSE)
# Stored as `comp` (not `c`) so that base::c() is not masked.
comp <- components(g)

# Append the cluster number to the original data. `membership` is named by
# vertex, so a single vectorized lookup by the P1 id is enough; P1 and P2 of
# a row are joined by an edge and therefore always share a component.
tbl_clustering$cluster <- unname(
  comp$membership[as.character(tbl_clustering$P1)]
)
这会为每一行分配聚类编号（此处显示前 8 行）：
> head(tbl_clustering, 8)
P1 P2 No_Interactions cluster
1 AT3G61540 AT4G30920 8 1
2 AT3G61540 AT4G30910 5 1
3 AT1G58080 AT2G45440 5 2
4 AT1G58080 AT1G09795 9 2
5 AT1G58080 AT5G52100 7 2
6 AT5G04710 AT5G60160 6 3
7 AT4G03520 AT1G75330 5 4
8 AT3G58730 AT1G20630 5 5
答案 1 :(得分:1)
我相信您希望将数据集划分为等价类。我有一个基于Numerical Recipes中的算法的实现。我已经包含了以下代码。它可以使用如下:
source("equivalence.R")

# Collect the universe of ids from the two pair columns.
# Factor columns contribute their levels, exactly as before. Character
# columns (what data.frame()/read.csv() produce by default since R 4.0) have
# no levels, so they contribute their values instead of silently yielding an
# empty id set.
ids <- unique(unlist(
  lapply(data[1:2], function(col) {
    if (is.factor(col)) levels(col) else as.character(col)
  }),
  use.names = FALSE
))

# One class label per id; ids linked by a row of `data` share a label.
classes <- equivalence(ids, data[1:2])

# Label every row through its P1 id (P1 and P2 of a row are in the same class).
data$class <- classes[match(data$P1, ids)]
**equivalence.R**
library(Rcpp)
Rcpp::sourceCpp('equivalence.cpp')
#' Partition values into equivalence classes
#'
#' @param x Vector of values to classify. Duplicates and `NA`s are allowed.
#' @param rules Two-column list or data frame; each row states that the value
#'   in its first column is equivalent to the value in its second column.
#' @return Integer vector with one entry per element of `x`. Equal entries
#'   mean "same class"; the entry is the 1-based position, among the unique
#'   non-`NA` values of `x`, of the class representative. `NA` elements of
#'   `x` yield `NA`.
equivalence <- function(x, rules) {
  # Distinct, non-missing values define the elements being partitioned.
  universe <- unique(x)
  universe <- universe[!is.na(universe)]

  # Translate both sides of every rule into positions within `universe`.
  left <- match(rules[[1]], universe)
  right <- match(rules[[2]], universe)

  # A rule is usable only when both of its sides are known values.
  usable <- !(is.na(left) | is.na(right))
  if (!all(usable)) {
    warning("Not all values in rules are present in x.")
    left <- left[usable]
    right <- right[usable]
  }

  # The compiled routine works with 0-based indices.
  roots <- c_equivalence(
    as.integer(left) - 1L,
    as.integer(right) - 1L,
    as.integer(length(universe))
  )

  # Expand back to the shape of `x` and return to 1-based labels.
  roots[match(x, universe)] + 1L
}
**equivalence.cpp**
#include <R.h>
#include <Rinternals.h>
#include <string>
extern "C" {
// Partition the integers 0..n-1 into equivalence classes.
//
//   ra, rb  integer vectors of equal length holding 0-based element indices;
//           each pair (ra[l], rb[l]) declares those two elements equivalent.
//   rn      integer vector whose first element is n, the number of elements.
//
// Returns an integer vector of length n in which entry i is the 0-based index
// of the representative of i's class; two elements are equivalent exactly
// when their entries are equal.
//
// Invalid input is reported with Rf_error(). Errors are raised directly
// rather than from inside a C++ catch handler, because Rf_error() leaves the
// function via longjmp. The Rf_-prefixed names are used because the
// unprefixed aliases (error, allocVector) do not exist when R_NO_REMAP is
// defined, and messages go through a "%s"-style format so that their text is
// never interpreted as a printf format.
// [[Rcpp::export]]
SEXP c_equivalence(SEXP ra, SEXP rb, SEXP rn) {
  if (TYPEOF(ra) != INTSXP || TYPEOF(rb) != INTSXP || TYPEOF(rn) != INTSXP)
    Rf_error("%s", "a, b and n must be integer vectors.");
  if (LENGTH(rn) < 1)
    Rf_error("%s", "n must have length 1.");
  if (LENGTH(ra) != LENGTH(rb))
    Rf_error("%s", "Lengths of a and be do not match.");

  const int* a = INTEGER(ra);
  const int* b = INTEGER(rb);
  const int m = LENGTH(ra);
  const int n = INTEGER(rn)[0];

  // NA_integer_ is represented as INT_MIN, so this also rejects a missing n.
  if (n < 0)
    Rf_error("%s", "n must be a non-negative integer.");

  // Every index is later used to subscript cls[], so reject anything outside
  // [0, n) (including NA) before touching memory.
  for (int l = 0; l < m; l++) {
    if (a[l] < 0 || a[l] >= n || b[l] < 0 || b[l] >= n)
      Rf_error("Pair %d of a/b refers to an element outside [0, n).", l + 1);
  }

  SEXP classes = PROTECT(Rf_allocVector(INTSXP, n));
  int* cls = INTEGER(classes);

  // Initialize each element as its own class.
  for (int k = 0; k < n; k++) cls[k] = k;

  // For each piece of input information...
  for (int l = 0; l < m; l++) {
    // Track first element up to its ancestor.
    int j = a[l];
    while (cls[j] != j) j = cls[j];
    // Track second element up to its ancestor.
    int k = b[l];
    while (cls[k] != k) k = cls[k];
    // If they are not already related, make them so.
    if (j != k) cls[j] = k;
  }

  // Final sweep: point every element directly at its highest ancestor.
  for (int j = 0; j < n; j++) {
    while (cls[j] != cls[cls[j]]) cls[j] = cls[cls[j]];
  }

  UNPROTECT(1);
  return classes;
}
}
答案 2 :(得分:1)
好的,这是一个新的答案,它可以解决一些问题。同样,dat
是数据框。
# Pass 1: give every row of `dat` a provisional cluster number.
#
# Inputs : `dat`, a data frame whose first two columns hold the paired ids.
# Outputs: `Cluster`  - one provisional cluster number per row of `dat`;
#          `Cmatches` - character matrix with one column per provisional
#                       cluster. Ids are appended two rows at a time and
#                       unused cells are "".
# A row sharing an id with several existing clusters is appended to each of
# them and labelled with the last one; a later merge step is expected to join
# such clusters.
n_rows <- nrow(dat)
Cluster <- rep(NA, n_rows) # initialise

# Convert the id columns once instead of once per row inside the loop.
p1 <- as.character(dat[[1]])
p2 <- as.character(dat[[2]])

# Seed the first cluster with the first row's pair. With no rows at all,
# start from an empty 2 x 0 matrix so the loop below is simply skipped.
Cmatches <- if (n_rows > 0) {
  matrix(c(p1[1], p2[1]), 2, 1)
} else {
  matrix(character(0), 2, 0)
}

# seq_len() rather than 1:n, which would iterate over c(1, 0) for empty data.
for (r in seq_len(n_rows)) {
  pair <- c(p1[r], p2[r])
  matched <- FALSE
  n_clusters <- ncol(Cmatches)
  for (cl in seq_len(n_clusters)) {
    if (any(pair %in% Cmatches[, cl])) {
      # add P1 and P2 from this row to the cluster which it matches:
      # two new rows, blank everywhere except in column `cl`
      new_rows <- matrix("", 2, n_clusters)
      new_rows[, cl] <- pair
      Cmatches <- rbind(Cmatches, new_rows)
      matched <- TRUE
      Cluster[r] <- cl
    }
  }
  if (!matched) {
    # add a new cluster, because the row doesn't match any existing one
    Cmatches <- cbind(Cmatches, c(pair, rep("", nrow(Cmatches) - 2)))
    Cluster[r] <- ncol(Cmatches)
  }
}
在此之后，您需要使用Cmatches矩阵，通过if(sum(match(Cmatches[, cl1], Cmatches[, cl2], incomparables = ""), na.rm = T) != 0)检查各个群集之间是否有共同的名称（其中cl1和cl2是要比较的群集编号）。如果该测试为真，则应将这两个群集合并为一个。
# Pass 2: merge provisional clusters (columns of `Cmatches`) that share an id.
#
# Inputs : `Cmatches` (one column per provisional cluster, "" = unused cell)
#          and `Cluster` (provisional cluster number per row).
# Outputs: `Cluster`  - relabelled so that all rows of connected clusters
#                       carry the smallest cluster number of their group;
#          `Cmatched` - for each provisional cluster, the cluster it was
#                       merged into, or NA if it was kept.
n_clusters <- ncol(Cmatches)
Cmatched <- rep(NA, n_clusters)

# Ids of each provisional cluster without the "" padding.
members <- lapply(
  seq_len(n_clusters),
  function(k) setdiff(Cmatches[, k], "")
)

# root_of[k] is the smallest-numbered cluster that k currently belongs to.
# Tracking it for every cluster keeps merging transitive: when 1~3 and 2~3
# but 1 and 2 share nothing directly, all three still end up together.
root_of <- seq_len(n_clusters)

# With fewer than two clusters there is nothing to compare; 1:(n - 1) would
# count down and index a column that does not exist.
for (cl in seq_len(max(n_clusters - 1, 0))) {
  for (cl2 in (cl + 1):n_clusters) {
    if (any(members[[cl]] %in% members[[cl2]])) {
      kept <- min(root_of[cl], root_of[cl2])
      absorbed <- max(root_of[cl], root_of[cl2])
      root_of[root_of == absorbed] <- kept
    }
  }
}

merged <- root_of != seq_len(n_clusters)
Cmatched[merged] <- root_of[merged]
Cluster <- root_of[as.integer(Cluster)]
这样您就得到了答案。最后只需执行dat <- cbind(dat, Cluster)
，把聚类编号作为新列加到数据框中。
答案 3 :(得分:0)