我计算了R(包NbClust
)和SAS(https://support.sas.com/documentation/onlinedoc/v82/techreport_a108.pdf)中的CCC指标。
除了E_R2和CCC之外,所有P-F和R-square都与SAS输出完全匹配。我已经尝试过多种方式但没有得到任何解释。
我附上了" sample_data.csv"," initial_seed.csv"和" SAS_cluster.Rdata"我在这里使用的文件: https://drive.google.com/folderview?id=0ByZXEsgyZupEemFTdWRuTEVmZ1k&usp=sharing
以下是R和SAS中预期R2(E_R2)的值:
E_R2 = 0.4630339 vs. ERSQ = 0.3732597284
你可以帮我找一下后台出了什么问题吗?
请在下面找到我用过的代码:
----------
<SAS-Code>
----------
proc fastclus data=sample_data maxiter=100 seed=initial_seed maxc=5 outstat=metrics out=output;
Var v1 v2 v3 v4 v5 v6;
run;
----------
<R-Code>
----------
load("SAS_cluster.Rdata")
clust.perf.metrics <- function(data, cl) {
data1 <- as.matrix(data)
numberObsBefore <- dim(data1)[1]
data <- na.omit(data1)
nn <- numberObsAfter <- dim(data)[1]
pp <- dim(data)[2]
qq <- max(cl)
TT <- t(data) %*% data
sizeEigenTT <- length(eigen(TT)$value)
eigenValues <- eigen(TT/(nn - 1))$value
for (i in 1:sizeEigenTT) {
if (eigenValues[i] < 0) {
cat(paste("There are only", numberObsAfter, "non-missing observations out of a possible",numberObsBefore, "observations."))
stop("The TSS matrix is indefinite. There must be too many missing values. The index cannot be calculated.")
}
}
s1 <- sqrt(eigenValues)
ss <- rep(1, sizeEigenTT)
for (i in 1:sizeEigenTT) {
if (s1[i] != 0)
ss[i] = s1[i]
}
vv <- prod(ss)
z <- matrix(0, ncol = qq, nrow = nn)
clX <- as.matrix(cl)
for (i in 1:nn)
for (j in 1:qq) {
z[i, j] == 0
if (clX[i, 1] == j) z[i, j] = 1
}
xbar <- solve(t(z) %*% z) %*% t(z) %*% data
B <- t(xbar) %*% t(z) %*% z %*% xbar
W <- TT - B
R2 <- 1 - (sum(diag(W))/sum(diag(TT)))
PseudoF <- (sum(diag(B))/(qq-1))/(sum(diag(W))/(nn-qq))
v1 <- 1
u1 <- rep(0, pp)
c1 <- (vv/qq)^(1/pp)
u1 <- ss/c1
k1 <- sum((u1 >= 1) == TRUE)
p1 <- min(k1, qq - 1)
if (all(p1 > 0, p1 < pp)) {
for (i in 1:p1) { v1 <- v1 * ss[i]}
c <- (v1/qq)^(1/p1)
u <- ss/c
b1 <- sum(1/(nn + u[1:p1]))
b2 <- sum(u[(p1 + 1):pp]^2/(nn + u[(p1 + 1):pp]), na.rm = TRUE)
E_R2 <- 1 - ((b1 + b2)/sum(u^2)) * ((nn - qq)^2/nn) * (1 + (4/nn))
ccc <- log((1 - E_R2)/(1 - R2)) * (sqrt(nn * p1/2)/((0.001 + E_R2)^1.2))
} else {
b1 <- sum(1/(nn + u))
E_R2 <- 1 - (b1/sum(u^2)) * ((nn - qq)^2/nn) * (1 + 4/nn)
ccc <- log((1 - E_R2)/(1 - R2)) * (sqrt(nn * pp/2)/((0.001 + E_R2)^1.2))
}
results <- list(R_2=R2, PseudoF=PseudoF, CCC = ccc, E_R2=E_R2);return(results)
}
clust.perf.metrics(output[,1:6],output[,7])
谢谢和问候,