I am trying to connect two workers running in VirtualBox. Environment: Spark 2.0.2, Scala 2.11.7, sbt 0.13.9, IDE: Scala IDE for Eclipse. The dataset file has been uploaded to HDFS. In the master web UI the application shows as finished. Spark runs in standalone mode on my local machine.
build.sbt
name := "movielens"
version := "1.0"
scalaVersion := "2.11.7"
libraryDependencies += "org.apache.spark" %% "spark-core" % 2.0.2" %
libraryDependencies += "org.apache.spark" %% "spark-mllib" % 2.0.2" %
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.0.2"%
libraryDependencies += "com.github.scopt" %% "scopt" % "3.3.0"%
My code
package com.sparkRDD

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object trainModel {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("spark://192.168.1.8:7077") // was local[*] when running locally
      .set("spark.executor.memory", "1g")
      .setAppName("trainModel")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Parse the first three CSV columns into MLlib Rating objects.
    val rawData = sc.textFile("hdfs://localhost:9001/data/rating.csv")
    val ratings = rawData.map(line => line.split(",").take(3) match {
      case Array(userId, movieId, rating) =>
        Rating(userId.toInt, movieId.toInt, rating.toDouble)
    })
    println(s"Number of ratings in movie file: ${ratings.count()}\n")

    // Split the raw text file into training and test sets.
    // (These splits are only counted below; the model is trained on the full ratings set.)
    val ratingsRDD = sc.textFile("hdfs://localhost:9001/dataset/rating.csv")
    import sqlContext.implicits._
    val splits = ratingsRDD.randomSplit(Array(0.8, 0.2), seed = 12345)
    val trainingRatingsRDD = splits(0).cache()
    val testRatingsRDD = splits(1).cache()
    val numTraining = trainingRatingsRDD.count()
    val numTest = testRatingsRDD.count()
    println(s"Training: $numTraining, test: $numTest.")

    // Train an ALS model.
    val rank = 10
    val numIterations = 10
    val model = ALS.train(ratings, rank, numIterations)

    // Predict a rating for every observed (user, movie) pair and join the
    // predictions against the observed ratings to compute the error.
    val userProducts = ratings.map { case Rating(userId, movieId, rating) =>
      (userId, movieId)
    }
    val predictions = model.predict(userProducts).map {
      case Rating(userId, movieId, rating) => ((userId, movieId), rating)
    }
    val ratesAndPreds = ratings.map { case Rating(userId, movieId, rating) =>
      ((userId, movieId), rating)
    }.join(predictions)
    val meanSquaredError = ratesAndPreds.map { case ((userId, movieId), (r1, r2)) =>
      val err = r1 - r2
      err * err
    }.mean()
    println("Mean Squared Error = " + meanSquaredError)
    val rmse = math.sqrt(meanSquaredError)
    println(s"Test RMSE = $rmse.")
  }
}
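For what it's worth, the missing class in the error below (trainModel$$anonfun$1) is the compiled closure from one of the map calls, so the executors apparently never receive my application jar. One way to ship it is to list the packaged jar on the SparkConf; a minimal sketch, where the jar path is an assumption based on the name/version/scalaVersion in build.sbt above (this is not what my code currently does):

// Sketch: explicitly list the application jar so Spark ships it to the executors.
// The path assumes the artifact that `sbt package` would produce from build.sbt.
val conf = new SparkConf()
  .setMaster("spark://192.168.1.8:7077")
  .setAppName("trainModel")
  .setJars(Seq("target/scala-2.11/movielens_2.11-1.0.jar"))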
Error
19/07/19 22:37:11 WARN TaskSetManager: Lost task 2.0 in stage 0.0 (TID 2, 127.0.0.1): java.lang.ClassNotFoundException: com.sparkRDD.trainModel$$anonfun$1
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1868)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1751)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2042)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
19/07/19 22:37:11 INFO TaskSetManager: Starting task 2.1 in stage 0.0 (TID 6, 127.0.0.1, partition 2, ANY, 5320 bytes)
19/07/19 22:37:11 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Launching task 6 on executor id: 0 hostname: 127.0.0.1.
19/07/19 22:37:11 INFO TaskSetManager: Lost task 1.0 in stage 0.0 (TID 1) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 1]
19/07/19 22:37:11 INFO TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 2]
19/07/19 22:37:11 INFO TaskSetManager: Lost task 3.0 in stage 0.0 (TID 3) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 3]
19/07/19 22:37:11 INFO TaskSetManager: Lost task 4.0 in stage 0.0 (TID 4) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 4]
19/07/19 22:37:11 INFO TaskSetManager: Starting task 4.1 in stage 0.0 (TID 7, 127.0.0.1, partition 4, ANY, 5320 bytes)
19/07/19 22:37:11 INFO TaskSetManager: Starting task 3.1 in stage 0.0 (TID 8, 127.0.0.1, partition 3, ANY, 5320 bytes)
19/07/19 22:37:11 INFO TaskSetManager: Lost task 2.1 in stage 0.0 (TID 6) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 5]
19/07/19 22:37:11 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Launching task 7 on executor id: 0 hostname: 127.0.0.1.
19/07/19 22:37:11 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Launching task 8 on executor id: 0 hostname: 127.0.0.1.
19/07/19 22:37:11 INFO TaskSetManager: Lost task 5.0 in stage 0.0 (TID 5) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 6]
19/07/19 22:37:11 INFO TaskSetManager: Starting task 5.1 in stage 0.0 (TID 9, 127.0.0.1, partition 5, ANY, 5320 bytes)
19/07/19 22:37:11 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Launching task 9 on executor id: 0 hostname: 127.0.0.1.
19/07/19 22:37:11 INFO TaskSetManager: Lost task 4.1 in stage 0.0 (TID 7) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 7]
19/07/19 22:37:11 INFO TaskSetManager: Starting task 4.2 in stage 0.0 (TID 10, 127.0.0.1, partition 4, ANY, 5320 bytes)
19/07/19 22:37:11 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Launching task 10 on executor id: 0 hostname: 127.0.0.1.
19/07/19 22:37:11 INFO TaskSetManager: Lost task 3.1 in stage 0.0 (TID 8) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 8]
19/07/19 22:37:11 INFO TaskSetManager: Starting task 3.2 in stage 0.0 (TID 11, 127.0.0.1, partition 3, ANY, 5320 bytes)
19/07/19 22:37:11 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Launching task 11 on executor id: 0 hostname: 127.0.0.1.
19/07/19 22:37:11 INFO TaskSetManager: Lost task 4.2 in stage 0.0 (TID 10) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 9]
19/07/19 22:37:11 INFO TaskSetManager: Starting task 4.3 in stage 0.0 (TID 12, 127.0.0.1, partition 4, ANY, 5320 bytes)
19/07/19 22:37:11 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Launching task 12 on executor id: 0 hostname: 127.0.0.1.
19/07/19 22:37:11 INFO TaskSetManager: Lost task 5.1 in stage 0.0 (TID 9) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 10]
19/07/19 22:37:11 INFO TaskSetManager: Starting task 5.2 in stage 0.0 (TID 13, 127.0.0.1, partition 5, ANY, 5320 bytes)
19/07/19 22:37:11 INFO TaskSetManager: Lost task 3.2 in stage 0.0 (TID 11) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 11]
19/07/19 22:37:11 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Launching task 13 on executor id: 0 hostname: 127.0.0.1.
19/07/19 22:37:11 INFO TaskSetManager: Starting task 3.3 in stage 0.0 (TID 14, 127.0.0.1, partition 3, ANY, 5320 bytes)
19/07/19 22:37:11 INFO TaskSetManager: Lost task 4.3 in stage 0.0 (TID 12) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 12]
19/07/19 22:37:11 ERROR TaskSetManager: Task 4 in stage 0.0 failed 4 times; aborting job
19/07/19 22:37:11 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Launching task 14 on executor id: 0 hostname: 127.0.0.1.
19/07/19 22:37:11 INFO TaskSetManager: Lost task 5.2 in stage 0.0 (TID 13) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 13]
19/07/19 22:37:11 INFO TaskSetManager: Lost task 3.3 in stage 0.0 (TID 14) on executor 127.0.0.1: java.lang.ClassNotFoundException (com.sparkRDD.trainModel$$anonfun$1) [duplicate 14]
19/07/19 22:37:11 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
19/07/19 22:37:11 INFO TaskSchedulerImpl: Cancelling stage 0
19/07/19 22:37:11 INFO DAGScheduler: ResultStage 0 (count at trainModel.scala:45) failed in 10.613 s
19/07/19 22:37:11 INFO DAGScheduler: Job 0 failed: count at trainModel.scala:45, took 10.945581 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 0.0 failed 4 times, most recent failure: Lost task 4.3 in stage 0.0 (TID 12, 127.0.0.1): java.lang.ClassNotFoundException: com.sparkRDD.trainModel$$anonfun$1
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1868)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1751)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2042)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1873)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1886)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1899)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1913)
at org.apache.spark.rdd.RDD.count(RDD.scala:1134)
at com.sparkRDD.trainModel$.main(trainModel.scala:45)
at com.sparkRDD.trainModel.main(trainModel.scala)
Caused by: java.lang.ClassNotFoundException: com.sparkRDD.trainModel$$anonfun$1
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1868)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1751)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2042)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
How can I fix this error? Thanks in advance.