在R中聚类定性数据

时间:2014-02-26 01:13:51

标签: r cluster-analysis

我有一个看起来像这样的data.frame(df):

              ZN.N         ZL.N
MMP2   (1.89,3.58]   (2.13,4.1] 
AEBP1  (1.89,3.58]   (2.13,4.1]
A1AG1  (1.89,3.58]   (2.13,4.1]
A1AT  [0.364,1.89] [0.275,2.13]
A2MG  [0.364,1.89] [0.275,2.13]
ENOA   (1.89,3.58]   (2.13,4.1]

我想根据两个变量(ZN.N和ZL.N)聚类 row.names (蛋白质)。我可以对这类数据使用 k.means 方法或层次聚类吗?

我试过

df.k2 <- k.means(df, 2) 

但它不起作用。我对聚类真的很陌生,如果这个问题很愚蠢,还请见谅。非常感谢!

以下是我的data.frame

dput
structure(list(ZN.N = structure(c(2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("[0.364,1.89]", "(1.89,3.58]"), class = "factor"), 
ZL.N = structure(c(2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 
2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 
2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 
1L, 1L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 1L, 
3L, 3L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 
2L, 1L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 2L, 2L, 3L, 1L, 3L, 2L, 
1L, 1L, 2L, 3L, 1L), .Label = c("[0.275,2.13]", "(2.13,4.1]", 
"(4.1,6.78]"), class = "factor")), .Names = c("ZN.N", "ZL.N"), class = "data.frame", row.names = c("MMP2", "AEBP1", "A1AG1", "A1AT", "A2MG", "ENOA", "ANGI", "ANGL2", "ANT3", "APOA1", "APOA2", "APOD", "PGBM", "PGS1", "CAH3", "CRAC1", "CILP1", "CILP2", "COMP", "CH3L1", "CH3L2", "CSPG4", "CCD80", "CO1A1", "CO2A1", "CO3A1", "CO6A1", "COCA1", "COFA1", "COIA1", "CO1A2", "CO6A2", "COBA2", "CO6A3", "C1QB", "C1R", "C1S", "CO3", "CO4B", "CO8A", "CFAB", "CFAH", "CRP", "KCRM", "CLC3A", "ECM1", "FIBA", "FIBB", "FIBG", "FGFP2", "FMOD", "FINC", "FBLN1", "FSTL1", "G3P", "HPT", "HBA", "HBB", "H2B1L", "H32", "H4", "HPLN1", "IGHA1", "IGHG1", "IGKC", "LAC6", "IGHM", "INHBA", "IBP3", "ITIH1", "MMP1", "LDHA", "LYSC", "TIMP1", "TIMP2", "MIME", "MOES", "MYG", "NID2", "NUCB1", "OSTP", "PPIA", "PPIB", "POSTN", "PRDX2", "PGAM1", "PA2GA", "PLTP", "PEDF", "IPSP", "LMNA", "PCOC1", "PRELP", "AMBP", "PDIA3", "PDIA6", "S10AA", "S10A8", "PRG4", "KPYM", "RNAS1", "HTRA1", "TRFE", "ALBU", "SAMP", "SMOC2", "MMP3", "TARSH", "TENA", "TENX", "TETN", "TSP3", "TSP4", "BGH3", "TTHY", "TR11B", "RL40", "CSPG2", "VIME", "VTNC"))

1 个答案:

答案 0 :(得分:2)

您在聚类时遇到问题的原因是:kmeans 需要一个数值矩阵,而您传给该函数的是一个包含因子变量的数据框。

您可以先将这些因子转换为数值,然后再运行 kmeans:

# Cluster the proteins (row names of `df`) into two groups with k-means.
# `df` is expected to already exist, holding the factor columns ZN.N and ZL.N.

# Fix the RNG seed so the run is repeatable (kmeans() picks its starting
# centres at random when `centers` is given as a count).
set.seed(144)

# kmeans() needs numeric input, so replace each factor column by its
# underlying level codes (1, 2, ...), stored as doubles.
factor_cols <- c("ZN.N", "ZL.N")
df[factor_cols] <- lapply(df[factor_cols], as.numeric)

# Named vector of cluster ids (1 or 2); the names are the row names of `df`.
clusters <- kmeans(df, centers = 2)$cluster

# Proteins assigned to cluster 1.
clusters1 <- names(clusters)[clusters == 1]
clusters1
#  [1] "MMP2"  "AEBP1" "A1AG1" "ENOA"  "APOA1" "PGS1"  "CAH3"  "CO1A1" "CO3A1"
# [10] "C1R"   "CO8A"  "CRP"   "KCRM"  "FIBB"  "FIBG"  "HPT"   "HBA"   "H32"  
# [19] "H4"    "IGHG1" "IGKC"  "INHBA" "MYG"   "NID2"  "POSTN" "PLTP"  "PEDF" 
# [28] "LMNA"  "PDIA3" "PDIA6" "S10AA" "S10A8" "TENA"  "TETN"  "TSP3"  "BGH3" 
# [37] "VIME" 

# Proteins assigned to cluster 2.
clusters2 <- names(clusters)[clusters == 2]
clusters2
#  [1] "A1AT"  "A2MG"  "ANGI"  "ANGL2" "ANT3"  "APOA2" "APOD"  "PGBM"  "CRAC1"
# [10] "CILP1" "CILP2" "COMP"  "CH3L1" "CH3L2" "CSPG4" "CCD80" "CO2A1" "CO6A1"
# [19] "COCA1" "COFA1" "COIA1" "CO1A2" "CO6A2" "COBA2" "CO6A3" "C1QB"  "C1S"  
# [28] "CO3"   "CO4B"  "CFAB"  "CFAH"  "CLC3A" "ECM1"  "FIBA"  "FGFP2" "FMOD" 
# [37] "FINC"  "FBLN1" "FSTL1" "G3P"   "HBB"   "H2B1L" "HPLN1" "IGHA1" "LAC6" 
# [46] "IGHM"  "IBP3"  "ITIH1" "MMP1"  "LDHA"  "LYSC"  "TIMP1" "TIMP2" "MIME" 
# [55] "MOES"  "NUCB1" "OSTP"  "PPIA"  "PPIB"  "PRDX2" "PGAM1" "PA2GA" "IPSP" 
# [64] "PCOC1" "PRELP" "AMBP"  "PRG4"  "KPYM"  "RNAS1" "HTRA1" "TRFE"  "ALBU" 
# [73] "SAMP"  "SMOC2" "MMP3"  "TARSH" "TENX"  "TSP4"  "TTHY"  "TR11B" "RL40" 
# [82] "CSPG2" "VTNC" 

在此代码中,ZN.N 被转换为数字 1 和 2,ZL.N 被转换为数字 1、2 和 3。然后 kmeans 会根据各点之间的欧氏距离对它们进行聚类。您必须自行判断这种做法对您的应用是否有意义。