When I run this code:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

val conf = new SparkConf().setMaster("local[2]").setAppName("MLlib")
val sc = new SparkContext(conf)
val data = sc.textFile("hdfs://192.168.1.20:8020/user/sparkMLlib/input/testMLlib.csv")
val parsedData = data.map(s => Vectors.dense(s.split(',').map(_.toDouble))).cache()
// Cluster the data into two classes using KMeans
val numClusters = 2
val numIterations = 20
val clusters = KMeans.train(parsedData, numClusters, numIterations)
// Evaluate clustering by computing Within Set Sum of Squared Errors
val WSSSE = clusters.computeCost(parsedData)
println("Within Set Sum of Squared Errors = " + WSSSE)
// Save and load model
clusters.save(sc, "hdfs://192.168.1.20:8020/user/sparkMLlib/output/")
val sameModel = KMeansModel.load(sc, "hdfs://192.168.1.20:8020/user/sparkMLlib/output/")
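For context, the map step assumes every line of testMLlib.csv is a comma-separated list of doubles, one vector per line; an illustrative input (not my actual data) would be:

0.1,0.1,0.1
9.0,9.0,9.0
9.2,9.2,9.2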
I get this runtime exception:
16/05/04 14:43:40 ERROR DefaultWriterContainer: Aborting task.
java.lang.NoClassDefFoundError: org/apache/spark/sql/types/GenericArrayData
at org.apache.spark.mllib.linalg.VectorUDT.serialize(Vectors.scala:207)
at org.apache.spark.mllib.linalg.VectorUDT.serialize(Vectors.scala:179)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$UDTConverter.toCatalystImpl(CatalystTypeConverters.scala:142)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:102)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter$2.apply(CatalystTypeConverters.scala:401)
at org.apache.spark.sql.execution.RDDConversions$$anonfun$productToRowRdd$1$$anonfun$apply$1.apply(ExistingRDD.scala:39)
at org.apache.spark.sql.execution.RDDConversions$$anonfun$productToRowRdd$1$$anonfun$apply$1.apply(ExistingRDD.scala:36)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:263)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.types.GenericArrayData
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
I don't understand this at all. It looks like an import or dependency problem, but I've looked everywhere and found nothing. It seems like a very strange exception. Can anyone explain it?
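Since I suspect a dependency mismatch: for reference, the only Spark dependencies I would expect this code to need are spark-core and spark-mllib (which pulls in spark-sql transitively), both on the same version. A minimal sketch of such a build.sbt, with an illustrative version number and not a confirmed fix:

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % "1.6.1",
  "org.apache.spark" %% "spark-mllib" % "1.6.1"  // brings in a matching spark-sql
)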