I'm using HDP Sandbox 2.6 with Spark 2.2.0 and Scala 2.11.8. I want to read data from MongoDB with Spark in a Zeppelin notebook, so I placed mongo-spark-connector_2.11-2.2.0 and mongo-java-driver-3.5.0 into spark2-client/jars. When I run:
import com.mongodb.spark.MongoSpark
import com.mongodb.spark.config.{ReadConfig, WriteConfig}
import com.mongodb.spark.sql._
import org.apache.spark.sql.functions._
import org.bson.Document

// Connection settings, with a secondaryPreferred read preference layered on top.
val readConfig = ReadConfig(
  Map("readPreference.name" -> "secondaryPreferred"),
  Some(ReadConfig(Map("uri" -> "mongodb://127.0.0.1:27017/", "database" -> "test", "collection" -> "collections"))))

// `spark` is the SparkSession Zeppelin provides; `.mongo` is added by the implicits in com.mongodb.spark.sql._
val zipDf = spark.read.mongo(readConfig)
I get:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 4 times, most recent failure: Lost task 0.3 in stage 6.0 (TID 27, sandbox-hdp.hortonworks.com, executor 1): java.lang.ClassNotFoundException: com.mongodb.spark.rdd.partitioner.MongoPartition
at org.apache.spark.repl.ExecutorClassLoader.findClass(ExecutorClassLoader.scala:82)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1858)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1744)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2032)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2277)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2201)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2059)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1566)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:426)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:312)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Any ideas? Thanks.
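For completeness, the only other way I know to make the connector visible to the executors is Zeppelin's dependency loader (%dep), which resolves the artifact from Maven and registers it with the Spark interpreter, instead of copying jars into spark2-client/jars. Below is a minimal sketch; it must run in its own paragraph before the Spark interpreter first starts, and whether it resolves the executor-side ClassNotFoundException in my setup is an assumption on my part:

%dep
// Clear any previously loaded dependencies, then pull the connector
// (real Maven coordinates) so Zeppelin ships it with the Spark jars.
z.reset()
z.load("org.mongodb.spark:mongo-spark-connector_2.11:2.2.0")

After running this paragraph, the interpreter has to be restarted and the import/read paragraph above re-run in the fresh session.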