我是R的新手,正在按照David MacKay的《信息论、推理与学习算法》(Information Theory, Inference, and Learning Algorithms)一书中列出的算法编写我自己的k均值(k-means)函数。我的尝试如下。该函数目前只能处理一维数据。如何修改它,使其适用于多变量数据(即每个数据点具有多个特征的数据)?任何提示都会有所帮助。提前致谢。
# ---- Generate data ----
# 1300 draws from three normal distributions (500 + 500 + 300 points),
# stored in a one-column data frame and then centred and scaled.
x <- c(
  rnorm(500, mean = 4, sd = 2),
  rnorm(500, mean = 5, sd = 1.5),
  rnorm(300, mean = 1, sd = 0.5)
)
data <- as.data.frame(x)
# scale() returns a matrix, so convert back to a data frame with column `x`.
d_scaled <- as.data.frame(scale(data))
#### Kmeans algorithm ########
##### Distance measure between two points p and q.
# Returns half of the squared Euclidean distance, 0.5 * sum((p - q)^2).
# No square root is taken, so the value is not the Euclidean distance itself.
#
# Args:
#   p, q: numeric values or vectors; they are subtracted element-wise.
#
# Returns:
#   A single numeric value.
getEuclid_dist <- function(p, q) {
  squared_diffs <- (p - q)^2
  sum(squared_diffs) / 2
}
##### Finds the index of the mean nearest to a single data point.
#
# Args:
#   data:  the data point to classify.
#   means: a list whose element `x` holds the current cluster means,
#          one entry per cluster.
#
# Returns:
#   A single integer: the position in `means$x` of the mean with the smallest
#   distance to `data`, as measured by getEuclid_dist(). When several means
#   are equally close, the smallest index wins, so exactly one cluster is
#   returned for the point.
get_nearest_mean <- function(data, means) {
  centres <- means$x
  dist <- vapply(
    seq_along(centres),
    function(k) getEuclid_dist(data, centres[k]),
    numeric(1)
  )
  # which.min() reports only the first minimum; matching every element against
  # the smallest distance would return all tied indices at once.
  which.min(dist)
}
##### K-means clustering of one-dimensional data.
#
# Only the column `x` of `data` is clustered; other columns are ignored.
#
# Args:
#   data: a data frame (or list) whose numeric element `x` holds the points.
#   k:    number of clusters; must be between 1 and the number of points.
#
# Returns:
#   An unnamed list with two elements:
#     [[1]] numeric vector of length k holding the final cluster means;
#     [[2]] k x nd matrix of 0/1 responsibilities, where entry [j, n] is 1
#           when data point n is assigned to cluster j.
mykmeans <- function(data, k) {
  x <- data$x
  nd <- length(x) # total number of data points
  stopifnot(
    is.numeric(x), !anyNA(x),
    is.numeric(k), length(k) == 1L, k >= 1, k <= nd
  )

  # 1. Set each mean to a distinct, randomly chosen data point.
  means <- list(x = x[sample.int(nd, k)])

  # Responsibilities: rows = clusters, columns = data points.
  # The two starting matrices differ on purpose so the loop runs at least once.
  rold <- matrix(0, k, nd)
  rcurr <- matrix(1, k, nd)

  # 2. Loop until the responsibilities no longer change.
  while (!all(rcurr == rold)) {
    rold <- rcurr
    rcurr <- matrix(0, k, nd)

    # 3. Assignment step: give every point to its nearest mean.
    for (n in seq_len(nd)) {
      kn <- get_nearest_mean(x[n], means)
      # Use only the first index in case several means are tied, so that each
      # point belongs to exactly one cluster.
      rcurr[kn[1L], n] <- 1
    }

    # 4. Update step: recompute each mean from the points assigned to it.
    #    A cluster with no points keeps its previous mean.
    for (nk in seq_len(k)) {
      currx <- x[rcurr[nk, ] == 1]
      if (length(currx) != 0) {
        means$x[nk] <- mean(currx)
      }
    }
  }

  # Return only after convergence; returning from inside the update loop would
  # stop after the first mean of the first pass.
  list(means$x, rcurr)
}
# Cluster the standardised data into three groups and keep the result.
clusobj <- mykmeans(data = d_scaled, k = 3)