有人能帮助我吗?
我在IntelliJ IDEA中运行了一个Spark应用程序。
/**
 * Spark driver that counts how often the words of the Hu & Liu positive and
 * negative lexicons occur in the unlabelled training reviews, and prints the
 * 25 most frequent words of each lexicon.
 *
 * All input paths are relative to the working directory of the JVM.
 */
object MainDriver {

  /**
   * Turns one raw document into the word tokens that are counted.
   *
   * Every character that is not a letter or digit becomes a space, the text is
   * split on single spaces, tokens of length 0 or 1 are dropped, the rest are
   * lower-cased, and finally tokens present in `stopWords` are removed.
   *
   * @param text      raw document contents
   * @param stopWords lower-case words to discard
   * @return the surviving tokens, in document order
   */
  private def tokenize(text: String, stopWords: Set[String]): Array[String] =
    text
      .map(c => if (Character.isLetterOrDigit(c)) c else ' ')
      .split(" ")
      // After the mapping above a token contains no whitespace, so the length
      // check alone also rejects the empty strings produced by repeated spaces.
      .filter(_.length > 1)
      .map(_.toLowerCase)
      .filterNot(stopWords.contains)

  /**
   * Entry point: builds a local two-thread SparkContext, runs the analysis and
   * prints the results to standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Spark Sentiment Analysis").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Stop the context even when a job fails, so a failed run does not leave
    // it running.
    try {
      val posWords = sc.textFile("src/main/resources/Hu_Liu_positive_word_list.txt")
      val negWords = sc.textFile("src/main/resources/Hu_Liu_negative_word_list.txt")

      val nltkStopWords = sc.textFile("src/main/resources/stopwords/english")
      val moreStopWds = sc.parallelize(List("cant", "didnt", "doesnt", "dont", "goes", "isnt", "hes",
        "shes", "thats", "theres", "theyre", "wont", "youll", "youre",
        "youve", "br", "ve", "re", "vs", "dick", "ginger", "hollywood",
        "jack", "jill", "john", "karloff", "kudrow", "orson", "peter", "tcm",
        "tom", "toni", "welles", "william", "wolheim", "nikita"))

      // Broadcast the stop words as a Set: membership is tested once per token,
      // and a Set lookup avoids scanning the whole array each time.
      val stopWords = sc.broadcast(
        (nltkStopWords union moreStopWds).filter(_.nonEmpty).collect().toSet)

      // NOTE(review): the IncompatibleClassChangeError reported when running
      // from the IDE is raised while reading this RDD; it looks like a Hadoop
      // version mismatch on the IDE classpath rather than a problem in this
      // code - confirm against the build dependencies.
      val inTrainUnsup = sc.wholeTextFiles("src/main/resources/reviews/train/unsup")
      val parsedTrainUnsup = inTrainUnsup.mapValues(text => tokenize(text, stopWords.value))

      // (word, occurrences) over all documents. Cached because both joins below
      // read it and it would otherwise be recomputed for each of them.
      val wordFreqDist = parsedTrainUnsup
        .flatMap { case (_, words) => words }
        .map(word => (word, 1))
        .reduceByKey(_ + _)
        .cache()

      // The -1 is only a placeholder value so the lexicon can be joined as a
      // pair RDD; the joined pair is reduced to the corpus frequency.
      val posItems = posWords.map(word => (word, -1)).join(wordFreqDist).mapValues(_._2)
      val negItems = negWords.map(word => (word, -1)).join(wordFreqDist).mapValues(_._2)

      // top(25) picks up this implicit ordering, which ranks pairs by their
      // count (the second element) instead of by the word.
      implicit val pairSortByValue: Ordering[(String, Int)] = new Ordering[(String, Int)] {
        override def compare(a: (String, Int), b: (String, Int)): Int = a._2 compare b._2
      }

      println("Top 25 positive words in unsup dataset")
      posItems.top(25).foreach(println)
      println("Top 25 negative words in unsup dataset")
      negItems.top(25).foreach(println)
    } finally {
      sc.stop()
    }
  }
}
如果我使用spark-submit,这段代码运行良好。
但是当我在IntelliJ IDEA中直接运行它时(菜单:运行 > 运行...),它会抛出异常。经过调查,问题似乎出在 val inTrainUnsup = sc.wholeTextFiles("src/main/resources/reviews/train/unsup") 这一行,因为当我只执行 inTrainUnsup.saveAsTextFile("test file") 时,也会抛出同样的异常。
14/11/11 10:21:07 ERROR executor.Executor: Exception in task 0.0 in stage 1.0 (TID 4)
java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.initNextRecordReader(CombineFileRecordReader.java:164)
at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.<init>(CombineFileRecordReader.java:126)
at org.apache.spark.input.WholeTextFileInputFormat.createRecordReader(WholeTextFileInputFormat.scala:44)
at org.apache.spark.rdd.NewHadoopRDD$$anon$1.<init>(NewHadoopRDD.scala:115)
at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:103)
at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:54)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:177)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:695)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:39)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:27)
at java.lang.reflect.Constructor.newInstance(Constructor.java:513)
at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.initNextRecordReader(CombineFileRecordReader.java:155)
... 16 more
Caused by: java.lang.IncompatibleClassChangeError: Found class org.apache.hadoop.mapreduce.TaskAttemptContext, but interface was expected
at org.apache.spark.input.WholeTextFileRecordReader.<init>(WholeTextFileRecordReader.scala:40)
... 21 more
14/11/11 10:21:07 ERROR executor.Executor: Exception in task 1.0 in stage 1.0 (TID 5)
java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.initNextRecordReader(CombineFileRecordReader.java:164)
at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.<init>(CombineFileRecordReader.java:126)
at org.apache.spark.input.WholeTextFileInputFormat.createRecordReader(WholeTextFileInputFormat.scala:44)
at org.apache.spark.rdd.NewHadoopRDD$$anon$1.<init>(NewHadoopRDD.scala:115)
at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:103)
at org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:54)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:177)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
at java.lang.Thread.run(Thread.java:695)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:39)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:27)
at java.lang.reflect.Constructor.newInstance(Constructor.java:513)
at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.initNextRecordReader(CombineFileRecordReader.java:155)
... 16 more
Caused by: java.lang.IncompatibleClassChangeError: Found class org.apache.hadoop.mapreduce.TaskAttemptContext, but interface was expected
at org.apache.spark.input.WholeTextFileRecordReader.<init>(WholeTextFileRecordReader.scala:40)
... 21 more
14/11/11 10:21:07 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 1.0 (TID 5, localhost): java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.initNextRecordReader(CombineFileRecordReader.java:164)
org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.<init>(CombineFileRecordReader.java:126)
org.apache.spark.input.WholeTextFileInputFormat.createRecordReader(WholeTextFileInputFormat.scala:44)
org.apache.spark.rdd.NewHadoopRDD$$anon$1.<init>(NewHadoopRDD.scala:115)
org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:103)
org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:65)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
org.apache.spark.scheduler.Task.run(Task.scala:54)
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:177)
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
java.lang.Thread.run(Thread.java:695)
14/11/11 10:21:07 ERROR scheduler.TaskSetManager: Task 1 in stage 1.0 failed 1 times; aborting job
14/11/11 10:21:07 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
14/11/11 10:21:07 INFO scheduler.TaskSetManager: Lost task 0.0 in stage 1.0 (TID 4) on executor localhost: java.lang.RuntimeException (java.lang.reflect.InvocationTargetException) [duplicate 1]
14/11/11 10:21:07 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
14/11/11 10:21:07 INFO scheduler.TaskSchedulerImpl: Cancelling stage 1
14/11/11 10:21:07 INFO scheduler.DAGScheduler: Failed to run saveAsTextFile at MainDriver.scala:30
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 1.0 failed 1 times, most recent failure: Lost task 1.0 in stage 1.0 (TID 5, localhost): java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.initNextRecordReader(CombineFileRecordReader.java:164)
org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.<init>(CombineFileRecordReader.java:126)
org.apache.spark.input.WholeTextFileInputFormat.createRecordReader(WholeTextFileInputFormat.scala:44)
org.apache.spark.rdd.NewHadoopRDD$$anon$1.<init>(NewHadoopRDD.scala:115)
org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:103)
org.apache.spark.rdd.NewHadoopRDD.compute(NewHadoopRDD.scala:65)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
org.apache.spark.scheduler.Task.run(Task.scala:54)
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:177)
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
java.lang.Thread.run(Thread.java:695)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1185)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1174)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1173)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1173)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:688)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1391)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)