I am using Spark and Scala to fetch data from a Hive table named persons. The table has a column name, but when I call foreach on the RDD an exception is thrown. The error is: Caused by: java.lang.ClassNotFoundException: test.RDDForEach$$anonfun$main$1
What I want to do is print the name of every person in the Hive table. More generally, I just want to use Spark to read data from Hive and print it; any other way of doing that would be fine as well.
package test
import scala.collection.mutable.ListBuffer
import org.slf4j.LoggerFactory
import com.typesafe.config._
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkConf
import scala.reflect.api.materializeTypeTag
import com.mongodb.spark._
import org.bson._
import com.mongodb.spark.config._
import com.github.nscala_time.time.Imports._
object RDDForEach {
  private val log = LoggerFactory.getLogger(this.getClass)
  private val conf = ConfigFactory.load()
  private val databaseName = conf.getString("mongodb.databasename")
  private val collection = conf.getString("mongodb.collection")
  private val mongouri_beehive = conf.getString("mongodb.mongouri_beehive")
  private val mongouri_tushare = conf.getString("mongodb.mongouri_tushare")
  private val mongouri_datamining = conf.getString("mongodb.mongouri_dataming")
  private val jar_location = conf.getString("hdfs.jar_location")
  private val hadoop_user = conf.getString("hadoop.user")
  System.setProperty("HADOOP_USER_NAME", hadoop_user)
  System.setProperty("SPARK_YARN_MODE", "yarn")
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("writeAddrMetaData")
      .set("spark.mongodb.input.uri", mongouri_beehive)
      .set("spark.mongodb.input.database", databaseName)
      .set("spark.mongodb.input.collection", collection)
      .setMaster("yarn-client")
      .set("spark.executor.memory", "1g")
      .set("spark.executor.cores", "1")
      .set("spark.cores.max", "2")
      .set("spark.driver.maxResultSize", "1g")
      .set("spark.driver.memory", "1g")
      .set("spark.yarn.dist.files", "src\\main\\resources\\yarn-site.xml,src\\main\\resources\\resource-types.xml")
      .set("spark.yarn.jars", jar_location)
      .set("spark.files", "src\\main\\resources\\hdfs-site.xml,src\\main\\resources\\core-site.xml")
    val builder = SparkSession.builder().config(sparkConf).enableHiveSupport()
    val ss = builder.getOrCreate()
    val sc = ss.sparkContext
    import ss.implicits._

    // read the name column from the Hive table and print every value;
    // the foreach action below is where the exception is thrown
    val df = ss.sql("select name from persons")
    df.rdd.foreach(f => println(f.getString(0)))
  }
}
The exception is:
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:919)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:919)
at delme.RDDForEach$.main(RDDForEach.scala:56)
at delme.RDDForEach.main(RDDForEach.scala)
Caused by: java.lang.ClassNotFoundException: delme.RDDForEach$$anonfun$main$1
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1866)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1749)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2040)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1571)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2285)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2209)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2067)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1571)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2285)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2209)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2067)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1571)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2285)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2209)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2067)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1571)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:80)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
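For what it's worth, this is roughly the minimal version of what I am trying to achieve. I assume something along these lines would also be acceptable: it collects the rows to the driver and prints them there, so no closure has to be deserialized on the executors. The table and column names are the same as above, and the object name PrintNames is just a placeholder:

package test

import org.apache.spark.sql.SparkSession

object PrintNames {
  def main(args: Array[String]): Unit = {
    // Hive support is needed so that "persons" resolves to the Hive table
    val ss = SparkSession.builder()
      .appName("printNames")
      .enableHiveSupport()
      .getOrCreate()

    val df = ss.sql("select name from persons")

    // collect() brings the rows back to the driver, so the printing
    // happens locally instead of inside an executor task
    df.collect().foreach(row => println(row.getString(0)))

    ss.stop()
  }
}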