I have created some methods in a Zeppelin note that do not serialize when run as one whole pipeline, yet the same logic works when I run it with temporary write/read-to-file steps in between.
The exact steps do not really matter, nor does what I am trying to achieve. What I want to know is why I get a serialization error when I run filterViableByUsersDAG in one go.

For example, the splitting code:
case class Visit(key: Long, uri: String, visits: Long)
def removeURIParams(uri: String): String = ...
def splitUri(visit: Visit): Seq[Visit] = { ... } // uses removeURIParams
val fileByUser = "urisGroupedByUser.parquet"
val fileByUserTemp = "urisGroupedByUserTemp.parquet"
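(The real helper bodies are elided above; just so the example is self-contained, here are simplified stand-ins, not the exact code, assuming splitUri expands a visit into one row per path prefix:)

// Simplified stand-ins for the elided helper bodies, for illustration only.
def removeURIParams(uri: String): String =
  uri.split('?').head // drop the query string, keep the path

def splitUri(visit: Visit): Seq[Visit] = {
  // "/a/b/c" -> rows for "/a", "/a/b", "/a/b/c", each keeping key and visits
  val segments = removeURIParams(visit.uri).stripPrefix("/").split('/').filter(_.nonEmpty)
  segments.scanLeft("")((acc, s) => s"$acc/$s").drop(1).map(p => visit.copy(uri = p)).toSeq
}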
And this is the one-shot version that fails:

def filterViableByUsersDAG(minTF: Int) = {
  val dfViableByDay = spark.read.parquet(mltrainingBasePath + fileByDay)
  val df = dfVisits
    .select($"uuid".as("key"), $"uri")
    .join(dfViableByDay.select($"original".as("uri")), "uri")
    .groupBy($"key", $"uri")
    .agg(count($"uri").as("visits"))
    .as[Visit]
    .flatMap(splitUri)
    .groupBy($"key", $"uri")
    .agg(sum($"visits").as("visits"))
    .groupBy($"uri")
    .agg(count($"key").as("count"))
    .where($"count" > minTF)
    .repartition(1)
  df.write.mode("overwrite").parquet(mltrainingBasePath + fileByUser)
  df
}
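For reference, the exception itself is easy to reproduce whenever a closure shipped to executors captures an org.apache.spark.sql.Column, which is not serializable. A minimal sketch, assuming a plain spark-shell session (names made up):

import org.apache.spark.sql.functions.col

val uriCol = col("uri") // org.apache.spark.sql.Column is not Serializable
// The lambda captures uriCol, so the ClosureCleaner rejects it before the job even runs:
spark.range(3).rdd.map(_ => uriCol.toString).collect()
// org.apache.spark.SparkException: Task not serializable
// Caused by: java.io.NotSerializableException: org.apache.spark.sql.Column

What puzzles me is that splitUri itself never touches a Column; the $"..." columns are only used in driver-side DataFrame calls.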
The exact same code runs without any serialization problem when I write a temporary parquet file between every step, which is not a solution:
def filterViableByUsers(minTF: Int) = {
  val dfViableByDay = spark.read.parquet(mltrainingBasePath + fileByDay)
  dfVisits
    .select($"uuid".as("key"), $"uri")
    .join(dfViableByDay.select($"original".as("uri")), "uri")
    .groupBy($"key", $"uri")
    .agg(count($"uri").as("visits"))
    .write.mode("overwrite").parquet(mltrainingBasePath + fileByDayTemp)
  spark.read.parquet(mltrainingBasePath + fileByDayTemp)
    .as[Visit]
    .flatMap(splitUri)
    .write.mode("overwrite").parquet(mltrainingBasePath + fileByUserSplit)
  val df = spark.read.parquet(mltrainingBasePath + fileByUserSplit)
    .groupBy($"key", $"uri")
    .agg(sum($"visits").as("visits"))
    .groupBy($"uri")
    .agg(count($"key").as("count"))
    .where($"count" > minTF)
    .repartition(1)
  df.write.mode("overwrite").parquet(mltrainingBasePath + fileByUser)
  df
}
The error is:

org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
...
Caused by: java.io.NotSerializableException: org.apache.spark.sql.Column
Serialization stack:
- object not serializable (class: org.apache.spark.sql.Column, value: uri)
- element of array (index: 0)
- array (class [Lorg.apache.spark.sql.Column;, size 3)
- field (class: $line58092229620.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw, name: columns, type: class [Lorg.apache.spark.sql.Column;)
...
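From the serialization stack it looks like the flatMap(splitUri) closure did not just capture splitUri: it dragged in the whole REPL line object ($line58092229620.$read$$iw...), and that wrapper happens to hold a columns field of type Array[Column], which is what actually fails to serialize. A workaround I have seen suggested for notebooks (sketch only, untested here, names mine) is to move the helpers into a self-contained serializable object so the closure no longer references the notebook wrapper:

// Hypothetical: isolate the helpers from the notebook line object.
object UriOps extends Serializable {
  def removeURIParams(uri: String): String = ??? // same body as above
  def splitUri(visit: Visit): Seq[Visit] = ???   // same body as above
}

// usage: ... .as[Visit].flatMap(UriOps.splitUri _) ...

Though that still would not explain why the single-DAG version captures the wrapper while the step-by-step one does not.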