Error in a simple Spark application

Time: 2015-07-23 23:01:53

Tags: apache-spark

I am running a simple Spark application that performs Word2Vec (word to vector). Here is my code (it comes from the Spark website):

import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

object SimpleApp {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Word2Vector")
    val sc = new SparkContext(conf)
    val input = sc.textFile("text8").map(line => line.split(" ").toSeq)
    val word2vec = new Word2Vec()
    val model = word2vec.fit(input)
    val synonyms = model.findSynonyms("china", 40)
    for ((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }
    // Save and load model
    model.save(sc, "myModelPath")
  }
}

When I run it, it gives me the following error message:

 Exception in thread "main" org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://GXYDEVVM:8020/user/hadoop/YOUR_SPARK_HOME/README.md
    at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285)
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228)
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
    at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:207)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1781)
    at org.apache.spark.rdd.RDD.count(RDD.scala:1099)
    at org.apache.spark.api.java.JavaRDDLike$class.count(JavaRDDLike.scala:442)
    at org.apache.spark.api.java.AbstractJavaRDDLike.count(JavaRDDLike.scala:47)
    at SimpleApp.main(SimpleApp.java:13)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:665)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:170)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:193)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:112)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)

What is the problem? Where does this path /user/hadoop/YOUR_SPARK_HOME/README.md come from?

1 Answer:

Answer 0 (score: 0):

This probably comes from your default Spark configuration. Take a look (or grep) in the conf directory of your Spark home; you should find a spark-env.sh file that may contain a reference to that odd file. What is actually happening is that Spark is trying to load a file from HDFS (which is more or less the standard when you run Spark on a cluster: your input/output should be reachable by both the master and the worker nodes). If you use Spark locally, you have to configure the Spark context with the setMaster method. Here is my version:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

object SparkDemo {

  def log[A](key:String)(job : =>A) = {
    val start = System.currentTimeMillis
    val output = job
    println("===> %s in %s seconds"
      .format(key, (System.currentTimeMillis - start) / 1000.0))
    output
  }

  def main(args: Array[String]):Unit ={

    val modelName ="w2vModel"

    val sc = new SparkContext(
      new SparkConf()
      .setAppName("SparkDemo")
      .set("spark.executor.memory", "4G")
      .set("spark.driver.maxResultSize", "16G")
      .setMaster("spark://192.168.1.53:7077") // ip of the spark master.
      // .setMaster("local[2]") // does not work... workers loose contact with the master after 120s
    )

    // take a look into target folder if you are unsure how the jar is named
    // one-liner to compile / run: sbt package && sbt run
    sc.addJar("./target/scala-2.10/sparkling_2.10-0.1.jar")

    val input = sc.textFile("./text8").map(line => line.split(" ").toSeq)

    val word2vec = new Word2Vec()

    val model = log("compute model") { word2vec.fit(input) }
    log ("save model") { model.save(sc, modelName) }

    val synonyms = model.findSynonyms("china", 40)
    for((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }

    val model2 = log("reload model") { Word2VecModel.load(sc, modelName) }
  }
}
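
If instead you just want to run locally (no cluster, no HDFS), a minimal sketch could look like the one below. The local[*] master, the relative text8 path, and the object/app names are illustrative assumptions, not part of the original answer:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.Word2Vec

object LocalWord2VecDemo {
  def main(args: Array[String]): Unit = {
    // Assumption: a purely local run, so no HDFS paths are involved.
    val conf = new SparkConf()
      .setAppName("LocalWord2VecDemo")
      .setMaster("local[*]") // use all local CPU cores on this machine
    val sc = new SparkContext(conf)

    // Assumption: the text8 corpus sits in the directory you launch from.
    val input = sc.textFile("text8").map(_.split(" ").toSeq)
    val model = new Word2Vec().fit(input)

    model.findSynonyms("china", 40).foreach { case (synonym, cosineSimilarity) =>
      println(s"$synonym $cosineSimilarity")
    }

    sc.stop()
  }
}

With a local master (and no Hadoop configuration on the classpath overriding the default filesystem), sc.textFile resolves relative paths against the local filesystem, so the hdfs://... InvalidInputException should go away as long as the file actually exists there.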