运行Scala MongoDB连接器时出现java.lang.ClassNotFoundException:org.apache.spark.sql.DataFrame错误

时间:2017-03-05 00:33:09

标签: scala apache-spark sbt apache-spark-sql

我正在尝试使用SBT运行Scala示例以从MongoDB读取数据。每当我尝试将从Mongo读取的数据访问到RDD时,我都会收到此错误。

Exception in thread "dag-scheduler-event-loop" java.lang.NoClassDefFoundError: org/apache/spark/sql/DataFrame
at java.lang.Class.getDeclaredMethods0(Native Method)
at java.lang.Class.privateGetDeclaredMethods(Class.java:2701)
at java.lang.Class.getDeclaredMethod(Class.java:2128)
at java.io.ObjectStreamClass.getPrivateMethod(ObjectStreamClass.java:1431)
at java.io.ObjectStreamClass.access$1700(ObjectStreamClass.java:72)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:494)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:468)
at java.security.AccessController.doPrivileged(Native Method)
at java.io.ObjectStreamClass.<init>(ObjectStreamClass.java:468)
at java.io.ObjectStreamClass.lookup(ObjectStreamClass.java:365)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1134)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)

我已经显式导入了Dataframe,即使它没有在我的代码中使用。任何人都可以帮忙解决这个问题吗?

我的代码:

package stream

import org.apache.spark._
import org.apache.spark.SparkContext._
import com.mongodb.spark._
import com.mongodb.spark.config._
import com.mongodb.spark.rdd.MongoRDD
import org.bson.Document
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.DataFrame

object SpaceWalk {

def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("SpaceWalk")
    .setMaster("local[*]")
    .set("spark.mongodb.input.uri", "mongodb://127.0.0.1/nasa.eva")
    .set("spark.mongodb.output.uri", "mongodb://127.0.0.1/nasa.astronautTotals")

    val sc = new SparkContext(sparkConf)
    val rdd = sc.loadFromMongoDB()


    def  breakoutCrew (  document: Document  ): List[(String,Int)]  = {
    println("INPUT"+document.get( "Duration").asInstanceOf[String])
      var minutes = 0;
      val timeString = document.get( "Duration").asInstanceOf[String]
      if( timeString != null && !timeString.isEmpty ) {
        val time =  document.get( "Duration").asInstanceOf[String].split( ":" )
        minutes = time(0).toInt * 60 + time(1).toInt
      }

      import scala.util.matching.Regex
      val pattern = new Regex("(\\w+\\s\\w+)")
      val names =  pattern findAllIn document.get( "Crew" ).asInstanceOf[String]
      var tuples : List[(String,Int)] = List()
      for ( name <- names ) { tuples = tuples :+ (( name, minutes ) ) }

      return tuples
    }

    val logs = rdd.flatMap( breakoutCrew ).reduceByKey( (m1: Int, m2: Int) => ( m1 + m2 ) )

    //logs.foreach(println)

    def mapToDocument( tuple: (String, Int )  ): Document = {
      val doc = new Document();
      doc.put( "name", tuple._1 )
      doc.put( "minutes", tuple._2 )

      return doc
    }

    val writeConf = WriteConfig(sc)
    val writeConfig = WriteConfig(Map("collection" -> "astronautTotals", "writeConcern.w" -> "majority", "db" -> "nasa"), Some(writeConf))

    logs.map( mapToDocument ).saveToMongoDB( writeConfig )

    import org.apache.spark.sql.SQLContext
    import com.mongodb.spark.sql._
    import org.apache.spark.sql.DataFrame

    // load the first dataframe "EVAs"
    val sqlContext = new SQLContext(sc);
    import sqlContext.implicits._
    val evadf = sqlContext.read.mongo()
    evadf.printSchema()
    evadf.registerTempTable("evas")

    // load the 2nd dataframe "astronautTotals"

    val astronautDF = sqlContext.read.option("collection", "astronautTotals").mongo[astronautTotal]()
    astronautDF.printSchema()
    astronautDF.registerTempTable("astronautTotals")

    sqlContext.sql("SELECT astronautTotals.name, astronautTotals.minutes FROM astronautTotals"  ).show()


    sqlContext.sql("SELECT astronautTotals.name, astronautTotals.minutes, evas.Vehicle, evas.Duration FROM " +
      "astronautTotals JOIN evas ON astronautTotals.name LIKE evas.Crew"  ).show()
}
}
 case class astronautTotal ( name: String, minutes: Integer )

这是我的sbt文件 -

name := "Project"
version := "1.0"    
scalaVersion := "2.11.7"    
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.0.0"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.0.0"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.0.0"
//libraryDependencies += "org.apache.spark" %% "spark-streaming-twitter" % "1.2.1"
libraryDependencies += "org.apache.bahir" %% "spark-streaming-twitter" % "2.0.0"
libraryDependencies += "org.mongodb.spark" %% "mongo-spark-connector" % "0.1"  

addCommandAlias("c1", "run-main stream.SaveTweets")
addCommandAlias("c2", "run-main stream.SpaceWalk")

outputStrategy := Some(StdoutOutput)
//outputStrategy := Some(LoggedOutput(log: Logger))    
fork in run := true

1 个答案:

答案 0 :(得分:5)

此错误消息是因为您使用的是仅支持Spark 1.x的不兼容库。您应该使用mongo-spark-connector 2.0.0+代替。请参阅:https://docs.mongodb.com/spark-connector/v2.0/