Question

我仍在尝试学习R，并与威斯康星州乳腺癌数据集一起运行kmeans算法。

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

我试图遍历多个K值，创建一个混淆矩阵并计算精度，但是我在代码中收到错误。

number of columns of result is not a multiple of vector length (arg 
2)Error in cm[l, ] : subscript out of bounds

我正在尝试调整此方法以适合我的需求 http://cowlet.org/2013/12/23/understanding-data-science-clustering-with-k-means-in-r.html

# Step 1: Import the CSV format data "wisc_bc_data.csv"
wbcd <- read.csv("/Users/petermccabe/Downloads/wisc_bc_data.csv", sep=",")
# Step 2: Explore the data, e.g. examine the structure of the wbcd data frame, normalisation, and so on
str(wbcd)

# drop the id feature which is useless in this case
wbcd <- wbcd[-1]


# table of diagnosis (We ignore data balancing process)
table(wbcd$diagnosis)

# recode diagnosis as a factor
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"),
                         labels = c("0", "1"))

# table or proportions with more informative labels
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)

# summarize three numeric features
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])

# create normalization function
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

# test normalization function - result should be identical
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))

# normalize the wbcd data
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
wbcd_n$diagnosis <- wbcd$diagnosis

# confirm that normalization worked
summary(wbcd_n$area_mean)

#Traing a model on the data
k <- 8
means <- kmeans(wbcd_n[,1:31], k)


table(data.frame(wbcd_n$diagnosis, means$cluster))


#create function for confusion matrix

calculate.confusion <- function(diagnoses, clusters)
{
  # generate a confusion matrix of cols C versus states S
  d <- data.frame(diagnosis = diagnoses, cluster = clusters)
  td <- as.data.frame(table(d))
  # convert from raw counts to percentage of each label
  pc <- matrix(ncol=max(clusters),nrow=0) # k cols
  for (i in 1:9) # 9 labels
  {
    total <- sum(td[td$diagnosis==td$diagnosis[i],3])
    pc <- rbind(pc, td[td$diagnosis==td$diagnosis[i],3]/total)
  }
  rownames(pc) <- td[1:9,1]
  return(pc)
}

#create function to assign labels for each clustser


assign.cluster.labels <- function(cm, k)
{
  # take the cluster label from the highest percentage in that column
  cluster.labels <- list()
  for (i in 1:k)
  {
    cluster.labels <- rbind(cluster.labels, row.names(cm)[match(max(cm[,i]), cm[,i])])
  }

  # this may still miss some labels, so make sure all labels are included
  for (l in rownames(cm)) 
  { 
    if (!(l %in% cluster.labels)) 
    { 
      cluster.number <- match(max(cm[l,]), cm[l,])
      cluster.labels[[cluster.number]] <- c(cluster.labels[[cluster.number]], l)
    } 
  }
  return(cluster.labels)
}



str(assign.cluster.labels(calculate.confusion(wbcd_n$diagnosis, means$cluster), k))


#calculate accuracy function

calculate.accuracy <- function(diagnoses, clabels)
{
  matching <- Map(function(diagnosis, labels) { diagnosis %in% labels }, diagnoses, clabels)
  tf <- unlist(matching, use.names=FALSE)
  return (sum(tf)/length(tf))
}


#creating model for K 2-8
results <- matrix(ncol=2, nrow=0)
models <- list()

for (k in 2:8)
{
  # Don't cluster columns for diagnosis 
  means <- kmeans(wbcd_n[,1:31], k)

  # generate a confusion matrix of cols C versus states S
  conf.mat <- calculate.confusion(wbcd_n$diagnosis, means$cluster)
  cluster.labels <- assign.cluster.labels(conf.mat, k)

  # Now calculate accuracy, using states and groups of labels for each cluster
  accuracy <- calculate.accuracy(wbcd_n$diagnosis, cluster.labels[means$cluster])
  results <- rbind(results, c(k, accuracy))
  models[[(length(models)+1)]] <- means
}

每次K迭代都要建立一个混淆矩阵并计算出准确性

结果的列数不是向量长度的整数（arg 2）cm [l，]中的错误：下标超出范围

0 个答案: