Question

参考@holzben Clustering: how to extract most distinguishing features?

回答的问题

我使用 skmeans 包成功得到了聚类结果，但不明白为什么每个聚类中的词频都这么小。我的数据集中大约有 10,000 条推文，所以这个结果让我无法理解。我哪里做错了？

我的数据集位于https://docs.google.com/a/siswa.um.edu.my/file/d/0B3-xuXnLwF0yTHAzbE5KbTlQWWM/edit

> class(myCorpus)
[1] "VCorpus" "Corpus"  "list"   
> dtm<-DocumentTermMatrix(myCorpus,control=list(wordLengths=c(1,Inf)))
> class(dtm)
[1] "DocumentTermMatrix"    "simple_triplet_matrix"
> clus <- skmeans(dtm, 3)
> clus
A hard spherical k-means partition of 10829 objects into 3 classes.
Class sizes: 2100, 6219, 2510
Call: skmeans(x = dtm, k = 3)
> mfrq_words_per_cluster <- function(clus, dtm, first = 6, unique = TRUE){
+   if(!any(class(clus) == "skmeans")) return("clus must be an skmeans object")
+   
+   dtm <- as.simple_triplet_matrix(dtm)
+   indM <- table(names(clus$cluster), clus$cluster) == 1 # generate bool matrix
+   
+   hfun <- function(ind, dtm){ # help function, summing up words
+     if(is.null(dtm[ind, ]))  dtm[ind, ] else  col_sums(dtm[ind, ])
+   }
+   frqM <- apply(indM, 2, hfun, dtm = dtm)
+   
+   if(unique){
+     # eliminate word which occur in several clusters
+     frqM <- frqM[rowSums(frqM > 0) == 1, ] 
+   }
+   # export to list, order and take first x elements 
+   res <- lapply(1:ncol(frqM), function(i, mat, first)
+     head(sort(mat[, i], decreasing = TRUE), first),
+     mat = frqM, first = first)
+   
+   names(res) <- paste0("CLUSTER_", 1:ncol(frqM))
+   return(res)
+ }
> mfrq_words_per_cluster(clus, dtm)
$CLUSTER_1
  srilanka    warrior airtickets   avionics        ayf   citizens 
     4          4          3          3          3          3 

$CLUSTER_2
   higher     jumpa        ec     bodoh komentari     batch 
       12        11         9         8         8         7 

$CLUSTER_3
       liong      ryanair           yi airlinescrew         aksi      berjaya 
           5            4            4            3            3            3

以下是我用来得到上述聚类结果的脚本：

# Load with library() so that a missing package stops the script right away;
# require() would only return FALSE and let a later call fail obscurely.
library("tm")
library("skmeans")
library("slam")

# Partition the documents of dtm into 3 clusters with spherical k-means.
# dtm (a DocumentTermMatrix) must already exist in the workspace.
clus <- skmeans(dtm, 3)

#' Most frequent words per cluster of a spherical k-means partition
#'
#' Sums the term counts of the documents assigned to each cluster and returns,
#' for every cluster, the `first` terms with the highest summed count.
#'
#' @param clus An `skmeans` object. `clus$cluster` must hold one cluster id
#'   per row of `dtm`, in the same order as the rows of `dtm`.
#' @param dtm A document-term matrix (anything `as.simple_triplet_matrix()`
#'   accepts) with documents in rows and terms in columns.
#' @param first Number of top terms to return per cluster, e.g. 10.
#' @param unique If `TRUE`, only terms that occur in exactly one cluster are
#'   considered; terms shared by several clusters are ignored. If `FALSE`,
#'   all terms of `dtm` are used.
#' @return A list with one element per cluster (`CLUSTER_1`, `CLUSTER_2`, ...,
#'   in increasing order of cluster id). Each element is a named numeric
#'   vector of summed term counts, sorted decreasingly and cut to `first`
#'   entries. If `clus` is not an `skmeans` object, the character string
#'   "clus must be an skmeans object" is returned instead.
mfrq_words_per_cluster <- function(clus, dtm, first = 6, unique = TRUE) {
  # Returned as a message (not raised via stop()) so that existing callers
  # keep seeing the same result for invalid input.
  if (!inherits(clus, "skmeans")) return("clus must be an skmeans object")

  dtm <- as.simple_triplet_matrix(dtm)

  membership <- clus$cluster
  if (length(membership) != nrow(dtm)) {
    stop("clus$cluster must have one entry per row of dtm", call. = FALSE)
  }

  # The argument `unique` shadows base::unique(), hence the explicit prefix.
  cluster_ids <- sort(base::unique(membership))
  if (length(cluster_ids) == 0L) {
    return(list())
  }

  # Select the rows of each cluster by comparing the membership vector itself.
  # Building the mask via table(names(...), ...) would order the documents by
  # sorted name ("1", "10", "100", "2", ...) and no longer line up with the
  # row order of dtm, so the wrong documents would be summed.
  sums <- lapply(cluster_ids, function(k) col_sums(dtm[membership == k, ]))
  frqM <- do.call(cbind, sums) # terms in rows, clusters in columns

  if (unique) {
    # Keep terms that occur in exactly one cluster; drop = FALSE keeps the
    # matrix shape even when only a single term survives.
    frqM <- frqM[rowSums(frqM > 0) == 1, , drop = FALSE]
  }

  # Per cluster: order terms by summed count and keep the first `first`.
  res <- lapply(seq_len(ncol(frqM)), function(i) {
    counts <- stats::setNames(frqM[, i], rownames(frqM))
    head(sort(counts, decreasing = TRUE), first)
  })

  names(res) <- paste0("CLUSTER_", seq_len(ncol(frqM)))
  res
}


# Print the top terms of every cluster using the defaults
# (first = 6, unique = TRUE).
mfrq_words_per_cluster(clus, dtm)

为什么在这么大的数据集中，各聚类的词频会如此之小？

0 个答案: