我正在尝试运行 org.apache.spark.ml.clustering.KMeans 中的 KMeans,但它在某个内部函数中失败了,提示某些向量的范数为负数。同样的代码在其他输入上可以成功运行。KMeans 的输入是另一个算法的输出。
// Configure the KMeans estimator: column wiring first, then clustering parameters.
val kMeans = new KMeans()
  .setFeaturesCol("features")   // input column read by the estimator
  .setPredictionCol("s2Label")  // output column written by transform
  .setK(kmeans_num_clusters)
  .setInitMode("k-means||")
  .setInitSteps(10)
  .setMaxIter(300)
  .setTol(0.0001)

// Fit on the feature vectors, then apply the fitted model to the same data.
val kMeansModel = kMeans.fit(FeaturesVecs)
val clusteredDF = kMeansModel.transform(FeaturesVecs)
这是错误日志。
18/12/04 13:31:59 ERROR ApplicationMaster: User class threw exception: java.lang.IllegalArgumentException: requirement failed
java.lang.IllegalArgumentException: requirement failed
at scala.Predef$.require(Predef.scala:212)
at org.apache.spark.mllib.util.MLUtils$.fastSquaredDistance(MLUtils.scala:487)
at org.apache.spark.mllib.clustering.KMeans$.fastSquaredDistance(KMeans.scala:606)
at org.apache.spark.mllib.clustering.LocalKMeans$$anonfun$3.apply(LocalKMeans.scala:49)
at org.apache.spark.mllib.clustering.LocalKMeans$$anonfun$3.apply(LocalKMeans.scala:49)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at org.apache.spark.mllib.clustering.LocalKMeans$.kMeansPlusPlus(LocalKMeans.scala:49)
at org.apache.spark.mllib.clustering.KMeans$$anonfun$24.apply(KMeans.scala:472)
at org.apache.spark.mllib.clustering.KMeans$$anonfun$24.apply(KMeans.scala:469)
at scala.collection.parallel.immutable.ParRange$ParRangeIterator.map2combiner(ParRange.scala:105)
at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1054)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply$mcV$sp(Tasks.scala:49)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
at scala.collection.parallel.Task$class.tryLeaf(Tasks.scala:51)
at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1051)
查看 MLUtils.scala 文件可以发现,第 487 行是对范数(norm)>= 0 的检查。
private[mllib] def fastSquaredDistance(
v1: Vector,
norm1: Double,
v2: Vector,
norm2: Double,
precision: Double = 1e-6): Double = {
val n = v1.size
require(v2.size == n)
require(norm1 >= 0.0 && norm2 >= 0.0)
val sumSquaredNorm = norm1 * norm1 + norm2 * norm2
val normDiff = norm1 - norm2
var sqDist = 0.0