Spark unable to use filter

Time: 2017-07-18 12:39:06

Tags: scala apache-spark serialization

  

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1857)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
    at org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2.apply(DateFormatConstraint.scala:32)
    at org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2.apply(DateFormatConstraint.scala:16)
    at org.exadatum.ddq.core.Runner$$anonfun$run$1$$anonfun$3.apply(Runner.scala:22)
    at org.exadatum.ddq.core.Runner$$anonfun$run$1$$anonfun$3.apply(Runner.scala:22)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at org.exadatum.ddq.core.Runner$$anonfun$run$1.apply(Runner.scala:22)
    at org.exadatum.ddq.core.Runner$$anonfun$run$1.apply(Runner.scala:20)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at org.exadatum.ddq.core.Runner$.run(Runner.scala:20)
    at org.exadatum.ddq.core.RunCheck.<init>(RunCheck.scala:104)
    at org.exadatum.ddq.core.DQJobTrigger$.main(DQJobTrigger.scala:39)
    at org.exadatum.ddq.core.DQJobTrigger.main(DQJobTrigger.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
Serialization stack:
    - object not serializable (class: org.apache.spark.SparkContext, value: org.apache.spark.SparkContext@1d9bd4d6)
    - field (class: org.exadatum.ddq.constraints.DateFormatConstraint, name: sc, type: class org.apache.spark.SparkContext)
    - object (class org.exadatum.ddq.constraints.DateFormatConstraint, DateFormatConstraint(startdate,java.text.SimpleDateFormat@4f76f1a0,org.apache.spark.SparkContext@1d9bd4d6,xdqdemo.customer_details))
    - field (class: org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2, name: $outer, type: class org.exadatum.ddq.constraints.DateFormatConstraint)
    - object (class org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2, )
    - field (class: org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2$$anonfun$3, name: $outer, type: class org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2)
    - object (class org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2$$anonfun$3, )
    - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, name: func$2, type: interface scala.Function1)
    - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, )
    - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF, name: f, type: interface scala.Function1)
    - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF, UDF(startdate#2))
    - writeObject data (class: scala.collection.immutable.$colon$colon)
    - object (class scala.collection.immutable.$colon$colon, List(UDF(startdate#2)))
    - field (class: org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan, name: predicates, type: interface scala.collection.Seq)
    - object (class org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan, InMemoryColumnarTableScan [phone_number#0,name#1,startdate#2], [UDF(startdate#2)], InMemoryRelation [phone_number#0,name#1,startdate#2], true, 10000, StorageLevel(false, true, false, true, 1), ConvertToUnsafe, None)
    - field (class: org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan$$anonfun$doExecute$1, name: $outer, type: class org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan)
    - object (class org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan$$anonfun$doExecute$1, )
    - field (class: org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1, name: f$22, type: interface scala.Function1)
    - object (class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1, )
    - field (class: org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21, name: $outer, type: class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1)
    - object (class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21, )
    - field (class: org.apache.spark.rdd.MapPartitionsRDD, name: f, type: interface scala.Function3)
    - object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[8] at rdd at DateFormatConstraint.scala:32)
    - field (class: org.apache.spark.NarrowDependency, name: rdd, type: class org.apache.spark.rdd.RDD)
    - object (class org.apache.spark.OneToOneDependency, org.apache.spark.OneToOneDependency@316975be)
    - writeObject data (class: scala.collection.immutable.$colon$colon)
    - object (class scala.collection.immutable.$colon$colon, List(org.apache.spark.OneToOneDependency@316975be))
    - field (class: org.apache.spark.rdd.RDD, name: org$apache$spark$rdd$RDD$$dependencies_, type: interface scala.collection.Seq)
    - object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[9] at rdd at DateFormatConstraint.scala:32)
    - field (class: org.apache.spark.NarrowDependency, name: rdd, type: class org.apache.spark.rdd.RDD)
    - object (class org.apache.spark.OneToOneDependency, org.apache.spark.OneToOneDependency@526fbb80)
    - writeObject data (class: scala.collection.immutable.$colon$colon)
    - object (class scala.collection.immutable.$colon$colon, List(org.apache.spark.OneToOneDependency@526fbb80))
    - field (class: org.apache.spark.rdd.RDD, name: org$apache$spark$rdd$RDD$$dependencies_, type: interface scala.collection.Seq)
    - object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[10] at rdd at DateFormatConstraint.scala:32)
    - field (class: org.apache.spark.rdd.RDD$$anonfun$collect$1, name: $outer, type: class org.apache.spark.rdd.RDD)
    - object (class org.apache.spark.rdd.RDD$$anonfun$collect$1, )
    - field (class: org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12, name: $outer, type: class org.apache.spark.rdd.RDD$$anonfun$collect$1)
    - object (class org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12, )
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
    ... 39 more
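The serialization stack narrows the failure down: the UDF passed to filter is an anonymous function defined inside DateFormatConstraint, so Spark has to serialize that enclosing instance, and its sc field holds a SparkContext, which is never serializable. Below is a minimal sketch of the pattern the trace points at; it is a hypothetical reduction, not the actual source, with names taken from the serialization stack:

import java.text.SimpleDateFormat
import scala.util.Try
import org.apache.spark.SparkContext
import org.apache.spark.sql.{Column, DataFrame}
import org.apache.spark.sql.functions.udf

// Hypothetical reduction of the failing pattern (field names follow the trace).
case class DateFormatConstraint(columnName: String,
                                format: SimpleDateFormat,
                                sc: SparkContext, // not serializable
                                tableName: String) {
  val fun = (df: DataFrame) => {
    // `format` and `columnName` are fields of this class, so the lambda captures `this`,
    // and serializing the UDF's closure drags the SparkContext field along with it.
    val cannotBeDate = udf((column: String) =>
      column != null && Try(format.parse(column)).isFailure)
    df.filter(cannotBeDate(new Column(columnName))).count() // throws Task not serializable
  }
}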

CODE SNIPPET:

val fun = (df: DataFrame) => {

format.setLenient(false)
val cannotBeDate = udf((column: String) => column != null && Try(format.parse(column)).isFailure)
val maybeCannotBeDateCount = Try(df.filter(cannotBeDate(new Column(columnName))).count);

/** Utility to persist all of the bad records   **/

val hiveContext = new HiveContext(sc)
import hiveContext.implicits._

//Writing all Bad records
//val intermediateWriteToHiveDf = df.filter(cannotBeDate(new Column(columnName)))
val writeToHiveDf = df.filter(cannotBeDate(new Column(columnName)))

var recordLists = new ListBuffer[List[(String, String, String)]]()
writeToHiveDf.rdd.collect().foreach {
  row =>
    val item = row.mkString("-")
    val recordList: List[(String, String, String)] = List(List(tableName, "ALWAYS_NULL_CONSTRAINT", item))
      .map { case List(a, b, c) => (a, b, c) }
    recordLists += recordList
}
val listRDD = sc.parallelize(recordLists.flatten)
val dataFrameToHive: DataFrame = listRDD.toDF("table_name", "constraint_applied", "data")
dataFrameToHive.write.mode("append").saveAsTable("xdqdemo.bad_records")



DateFormatConstraintResult(
  this,
  data = maybeCannotBeDateCount.toOption.map(DateFormatConstraintResultData),
  status = ConstraintUtil.tryToStatus[Long](maybeCannotBeDateCount, _ == 0)
)

}

1 Answer:

Answer 0 (score: 0)

object checkConstraint extends Serializable {
  // columnName is passed in explicitly here (an added parameter for completeness;
  // the answer as posted referenced it from the enclosing scope).
  def checkDateFormat(format: SimpleDateFormat, columnName: String, df: DataFrame): DataFrame = {
    format.setLenient(false)
    // The parsing logic lives in a serializable singleton, so the UDF's closure
    // no longer captures the class that holds the SparkContext.
    val checkDateFormat = (column: String) => Try(format.parse(column)).isFailure
    val cannotBeDate = udf((column: String) => column != null && checkDateFormat(column))
    df.filter(cannotBeDate(new Column(columnName)))
  }
}


val writeToHiveDf = checkConstraint.checkDateFormat(format, columnName, df)

So all of the computation is packaged in a singleton object, which returns the required DataFrame.
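
With the filter isolated in a serializable object, the driver-side bookkeeping from the original snippet (counting the failures and appending the bad rows to the Hive table) can operate on the returned writeToHiveDf as before. A rough sketch under that assumption follows; it reuses columnName, format, df, sc and tableName from the question's scope, and the wiring is illustrative rather than the poster's actual code:

// Only a Long travels back to the driver for the status check.
val maybeCannotBeDateCount = Try(writeToHiveDf.count())

// Persist the offending rows; the UDF no longer drags a SparkContext into the task.
val hiveContext = new HiveContext(sc)
import hiveContext.implicits._

val badRecords = writeToHiveDf.rdd.collect().map { row =>
  (tableName, "ALWAYS_NULL_CONSTRAINT", row.mkString("-")) // label kept from the question
}
sc.parallelize(badRecords).toDF("table_name", "constraint_applied", "data")
  .write.mode("append").saveAsTable("xdqdemo.bad_records")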