["Handle", "Work", "Image", "Firstname", "Lastname", "Email"].forEach(v => {
if t[v] != "" {
item[v] = t[v]
}
});
My training and test data both use the format shown further below. Here is my code:
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

sc.setLogLevel("OFF")
sc.stop() // stop the shell's SparkContext so the StreamingContext can create its own

val conf = new SparkConf().setMaster("local[2]").setAppName("HDFSkWordCount")
val ssc = new StreamingContext(conf, Seconds(10))

// Training stream: each line is parsed as a bare vector.
val trainData = ssc.textFileStream("hdfs://localhost:9000/user/streaming/train/").map(Vectors.parse).cache()
// Test stream: each line is parsed as a labeled point.
val test_data = ssc.textFileStream("hdfs://localhost:9000/user/streaming/test/").map(LabeledPoint.parse)

val initialCenters = Array(Vectors.dense(50.0, 50.0), Vectors.dense(400.0, 400.0), Vectors.dense(1000.0, 1000.0))
val model = new StreamingKMeans().setK(3).setDecayFactor(1.0).setInitialCenters(initialCenters, Array(0.0, 0.0, 0.0))

model.trainOn(trainData)
model.predictOnValues(test_data.map(lp => (lp.label.toInt, lp.features))).print()

ssc.start()
ssc.awaitTerminationOrTimeout(10) // note: the timeout argument is in milliseconds
Test data:
[1, 2]
[3, 4]
[5, 6]
[7, 8]
[9, 10]
[11, 12]
[13, 14]
[15, 16]
[17, 18]
[19, 20]
[21, 22]
[23, 24]
[25, 26]
[27, 28]
[29, 30]
My training and test text files are in the corresponding folders on HDFS.
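As a sanity check on the formats, here is a minimal non-streaming sketch of what, as I understand it, the two parse methods accept (both are static, so this runs without a SparkContext; please correct me if my reading of the formats is wrong):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Vectors.parse expects a bare vector literal, like my data lines:
val v = Vectors.parse("[1, 2]")                 // dense vector [1.0, 2.0]

// LabeledPoint.parse expects a (label, [features]) pair:
val lp = LabeledPoint.parse("(1.0,[1.0,2.0])")  // label 1.0, features [1.0, 2.0]

// A bare vector line has no label, so I would expect this to throw
// a NumberFormatException rather than parse:
// val bad = LabeledPoint.parse("[1, 2]")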
Is there a problem with the data format, or is the code itself wrong? Any help would be greatly appreciated.
Thanks