我使用spark-1.5.2使用GaussianMixture
对数据集进行聚类。除了生成的GaussianMixtureModel
之外,不会发生任何错误,并且它们的权重相同。达到指定容差所需的迭代次数约为2,这似乎太低了。
我可以调整哪些参数,以便群集形成不同的值?
import org.apache.spark.SparkContext
import org.apache.spark.rdd._
import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.{Vector, Vectors}
def sparkContext: SparkContext = {
import org.apache.spark.SparkConf
new SparkContext(new SparkConf().setMaster("local[*]").setAppName("console"))
}
implicit val sc = sparkContext
def observationsRdd(implicit sc: SparkContext): RDD[Vector] = {
sc.textFile("observations.csv")
.map { line => Vectors.dense(line.split(",").map { _.toDouble }) }
}
val gmm = {new GaussianMixture()
.setK(6)
.setMaxIterations(1000)
.setConvergenceTol(0.001)
.setSeed(1)
.run(observationsRdd)}
for (i <- 0 until gmm.k) {
println("weight=%f\nmu=%s\nsigma=\n%s\n" format
(gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma))
}
截断输出:
weight=0.166667
mu=[4730.358845338535,4391.695550847029,4072.3224046605947,4253.183898304653,4454.124682202946,4775.553442796136,4980.3952860164545,4812.717637711368,5120.44449152493,2820.1827330505857,180.10291313557565,4189.185858050445,3690.793644067457]
sigma=
422700.24745093845 382225.3248240414 398121.9356855869 ... (13 total)
382225.3248240414 471186.33178427175 455777.0565262309 ...
398121.9356855869 455777.0565262309 461210.0532084378 ...
469361.3787142044 497432.39963363775 515341.1303306988 ...
474369.6318494179 482754.83801426284 500047.5114985542 ...
453832.62301188655 443147.58931290614 461017.7038258409 ...
458641.51202210854 433511.1974652861 452015.6655154465 ...
387980.29836054996 459673.3283909025 455118.78272128507 ...
461724.87201332086 423688.91832506843 442649.18455604656 ...
291940.48273324646 257309.1054220978 269116.23674394307 ...
16289.3063964479 14790.06803739929 15387.484828872432 ...
334045.5231910066 338403.3492767321 350531.7768916226 ...
280036.0894114749 267624.69326772855 279651.401859903 ...
weight=0.166667
mu=[4730.358845338535,4391.695550847029,4072.3224046605947,4253.183898304653,4454.124682202946,4775.553442796136,4980.3952860164545,4812.717637711368,5120.44449152493,2820.1827330505857,180.10291313557565,4189.185858050445,3690.793644067457]
sigma=
422700.24745093845 382225.3248240414 398121.9356855869 ... (13 total)
382225.3248240414 471186.33178427175 455777.0565262309 ...
398121.9356855869 455777.0565262309 461210.0532084378 ...
469361.3787142044 497432.39963363775 515341.1303306988 ...
474369.6318494179 482754.83801426284 500047.5114985542 ...
453832.62301188655 443147.58931290614 461017.7038258409 ...
458641.51202210854 433511.1974652861 452015.6655154465 ...
387980.29836054996 459673.3283909025 455118.78272128507 ...
461724.87201332086 423688.91832506843 442649.18455604656 ...
291940.48273324646 257309.1054220978 269116.23674394307 ...
16289.3063964479 14790.06803739929 15387.484828872432 ...
334045.5231910066 338403.3492767321 350531.7768916226 ...
280036.0894114749 267624.69326772855 279651.401859903 ...
weight=0.166667
mu=[4730.358845338535,4391.695550847029,4072.3224046605947,4253.183898304653,4454.124682202946,4775.553442796136,4980.3952860164545,4812.717637711368,5120.44449152493,2820.1827330505857,180.10291313557565,4189.185858050445,3690.793644067457]
sigma=
422700.24745093845 382225.3248240414 398121.9356855869 ... (13 total)
382225.3248240414 471186.33178427175 455777.0565262309 ...
398121.9356855869 455777.0565262309 461210.0532084378 ...
469361.3787142044 497432.39963363775 515341.1303306988 ...
474369.6318494179 482754.83801426284 500047.5114985542 ...
453832.62301188655 443147.58931290614 461017.7038258409 ...
458641.51202210854 433511.1974652861 452015.6655154465 ...
387980.29836054996 459673.3283909025 455118.78272128507 ...
461724.87201332086 423688.91832506843 442649.18455604656 ...
291940.48273324646 257309.1054220978 269116.23674394307 ...
16289.3063964479 14790.06803739929 15387.484828872432 ...
334045.5231910066 338403.3492767321 350531.7768916226 ...
280036.0894114749 267624.69326772855 279651.401859903 ...
weight=0.166667
mu=[4730.358845338535,4391.695550847029,4072.3224046605947,4253.183898304653,4454.124682202946,4775.553442796136,4980.3952860164545,4812.717637711368,5120.44449152493,2820.1827330505857,180.10291313557565,4189.185858050445,3690.793644067457]
sigma=
422700.24745093845 382225.3248240414 398121.9356855869 ... (13 total)
382225.3248240414 471186.33178427175 455777.0565262309 ...
398121.9356855869 455777.0565262309 461210.0532084378 ...
469361.3787142044 497432.39963363775 515341.1303306988 ...
474369.6318494179 482754.83801426284 500047.5114985542 ...
453832.62301188655 443147.58931290614 461017.7038258409 ...
458641.51202210854 433511.1974652861 452015.6655154465 ...
387980.29836054996 459673.3283909025 455118.78272128507 ...
461724.87201332086 423688.91832506843 442649.18455604656 ...
291940.48273324646 257309.1054220978 269116.23674394307 ...
16289.3063964479 14790.06803739929 15387.484828872432 ...
334045.5231910066 338403.3492767321 350531.7768916226 ...
280036.0894114749 267624.69326772855 279651.401859903 ...
weight=0.166667
mu=[4730.358845338535,4391.695550847029,4072.3224046605947,4253.183898304653,4454.124682202946,4775.553442796136,4980.3952860164545,4812.717637711368,5120.44449152493,2820.1827330505857,180.10291313557565,4189.185858050445,3690.793644067457]
sigma=
422700.24745093845 382225.3248240414 398121.9356855869 ... (13 total)
382225.3248240414 471186.33178427175 455777.0565262309 ...
398121.9356855869 455777.0565262309 461210.0532084378 ...
469361.3787142044 497432.39963363775 515341.1303306988 ...
474369.6318494179 482754.83801426284 500047.5114985542 ...
453832.62301188655 443147.58931290614 461017.7038258409 ...
458641.51202210854 433511.1974652861 452015.6655154465 ...
387980.29836054996 459673.3283909025 455118.78272128507 ...
461724.87201332086 423688.91832506843 442649.18455604656 ...
291940.48273324646 257309.1054220978 269116.23674394307 ...
16289.3063964479 14790.06803739929 15387.484828872432 ...
334045.5231910066 338403.3492767321 350531.7768916226 ...
280036.0894114749 267624.69326772855 279651.401859903 ...
weight=0.166667
mu=[4730.358845338535,4391.695550847029,4072.3224046605947,4253.183898304653,4454.124682202946,4775.553442796136,4980.3952860164545,4812.717637711368,5120.44449152493,2820.1827330505857,180.10291313557565,4189.185858050445,3690.793644067457]
sigma=
422700.24745093845 382225.3248240414 398121.9356855869 ... (13 total)
382225.3248240414 471186.33178427175 455777.0565262309 ...
398121.9356855869 455777.0565262309 461210.0532084378 ...
469361.3787142044 497432.39963363775 515341.1303306988 ...
474369.6318494179 482754.83801426284 500047.5114985542 ...
453832.62301188655 443147.58931290614 461017.7038258409 ...
458641.51202210854 433511.1974652861 452015.6655154465 ...
387980.29836054996 459673.3283909025 455118.78272128507 ...
461724.87201332086 423688.91832506843 442649.18455604656 ...
291940.48273324646 257309.1054220978 269116.23674394307 ...
16289.3063964479 14790.06803739929 15387.484828872432 ...
334045.5231910066 338403.3492767321 350531.7768916226 ...
280036.0894114749 267624.69326772855 279651.401859903 ...
...
此外,代码,输入数据和输出数据可作为要点@ https://gist.github.com/aaron-santos/91b4931a446c460e082b2b3055b9950f
谢谢
答案 0 :(得分:0)
我通过ELKI运行您的数据(我必须删除最后一行,这是不完整的)。它起初也不起作用,我假设这是由于属性的 scale 以及默认初始化。 Spark中可能存在同样的问题。
在缩放数据之后,我可以使用ELKI获得一些合理的聚类(可视化13个维度中的前三个):
但从数据点的分布判断我认为高斯混合建模不适用于此数据。这些点似乎是从某些超曲面或某些轨迹中进行网格采样的;不是来自高斯(!)分布。
以下是我使用的ELKI参数:
-dbc.in /tmp/observations.csv
-dbc.filter normalization.columnwise.AttributeWiseVarianceNormalization
-algorithm clustering.em.EM -em.k 6
-em.centers RandomlyChosenInitialMeans -kmeans.seed 0
可能值得尝试其他聚类算法,例如HDBSCAN,它可以识别基于密度的聚类:
参数:
-dbc.in /tmp/observations.csv
-dbc.filter normalization.columnwise.AttributeWiseVarianceNormalization
-algorithm clustering.hierarchical.extraction.HDBSCANHierarchyExtraction
-algorithm SLINKHDBSCANLinearMemory
-hdbscan.minPts 50 -hdbscan.minclsize 100
我也会尝试OPTICS,因为我发现HDBSCAN通常只捕获集群的核心(按设计)。从OPTICS图中,我不会说集群的定义非常明确。
除了尝试其他聚类算法之外,我认为您还需要在预处理和投影数据方面做很多工作,因为它具有非常强的相关性。尝试将关于数据的尽可能多的先验知识放入预处理中以改善结果。