我仍在尝试学习R,并与威斯康星州乳腺癌数据集一起运行kmeans算法。
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
我试图遍历多个K值,创建一个混淆矩阵并计算精度,但是我在代码中收到错误。
number of columns of result is not a multiple of vector length (arg
2)Error in cm[l, ] : subscript out of bounds
我正在尝试调整此方法以适合我的需求 http://cowlet.org/2013/12/23/understanding-data-science-clustering-with-k-means-in-r.html
# Step 1: Import the CSV format data "wisc_bc_data.csv"
wbcd <- read.csv("/Users/petermccabe/Downloads/wisc_bc_data.csv", sep=",")
# Step 2: Explore the data, e.g. examine the structure of the wbcd data frame, normalisation, and so on
str(wbcd)
# drop the id feature which is useless in this case
wbcd <- wbcd[-1]
# table of diagnosis (We ignore data balancing process)
table(wbcd$diagnosis)
# recode diagnosis as a factor
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"),
labels = c("0", "1"))
# table or proportions with more informative labels
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
# summarize three numeric features
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])
# create normalization function
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
# test normalization function - result should be identical
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))
# normalize the wbcd data
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
wbcd_n$diagnosis <- wbcd$diagnosis
# confirm that normalization worked
summary(wbcd_n$area_mean)
#Traing a model on the data
k <- 8
means <- kmeans(wbcd_n[,1:31], k)
table(data.frame(wbcd_n$diagnosis, means$cluster))
#create function for confusion matrix
calculate.confusion <- function(diagnoses, clusters)
{
# generate a confusion matrix of cols C versus states S
d <- data.frame(diagnosis = diagnoses, cluster = clusters)
td <- as.data.frame(table(d))
# convert from raw counts to percentage of each label
pc <- matrix(ncol=max(clusters),nrow=0) # k cols
for (i in 1:9) # 9 labels
{
total <- sum(td[td$diagnosis==td$diagnosis[i],3])
pc <- rbind(pc, td[td$diagnosis==td$diagnosis[i],3]/total)
}
rownames(pc) <- td[1:9,1]
return(pc)
}
#create function to assign labels for each clustser
assign.cluster.labels <- function(cm, k)
{
# take the cluster label from the highest percentage in that column
cluster.labels <- list()
for (i in 1:k)
{
cluster.labels <- rbind(cluster.labels, row.names(cm)[match(max(cm[,i]), cm[,i])])
}
# this may still miss some labels, so make sure all labels are included
for (l in rownames(cm))
{
if (!(l %in% cluster.labels))
{
cluster.number <- match(max(cm[l,]), cm[l,])
cluster.labels[[cluster.number]] <- c(cluster.labels[[cluster.number]], l)
}
}
return(cluster.labels)
}
str(assign.cluster.labels(calculate.confusion(wbcd_n$diagnosis, means$cluster), k))
#calculate accuracy function
calculate.accuracy <- function(diagnoses, clabels)
{
matching <- Map(function(diagnosis, labels) { diagnosis %in% labels }, diagnoses, clabels)
tf <- unlist(matching, use.names=FALSE)
return (sum(tf)/length(tf))
}
#creating model for K 2-8
results <- matrix(ncol=2, nrow=0)
models <- list()
for (k in 2:8)
{
# Don't cluster columns for diagnosis
means <- kmeans(wbcd_n[,1:31], k)
# generate a confusion matrix of cols C versus states S
conf.mat <- calculate.confusion(wbcd_n$diagnosis, means$cluster)
cluster.labels <- assign.cluster.labels(conf.mat, k)
# Now calculate accuracy, using states and groups of labels for each cluster
accuracy <- calculate.accuracy(wbcd_n$diagnosis, cluster.labels[means$cluster])
results <- rbind(results, c(k, accuracy))
models[[(length(models)+1)]] <- means
}
每次K迭代都要建立一个混淆矩阵并计算出准确性