While processing a text file with rdd.map, I am trying to checkpoint some data to a new HDFS file.
Below is the working code:
import java.io.BufferedOutputStream
import org.apache.hadoop.fs.{FileSystem, Path}

object data_store {
  def main(argsi: Array[String]): Unit = {
    . . . .
    . . . . // (setup of `spark`, infile, outfile, rejfile elided)

    // Set up the checkpoint file
    val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    val output = fs.create(new Path(rejfile))
    val ck = new BufferedOutputStream(output)
    ck.write("Check pointing.. ".getBytes)

    val mstr_rdd = spark.sparkContext.textFile(infile)
    val mstr_recs = mstr_rdd.map(line => convert_data(line))
    mstr_recs.saveAsTextFile(outfile)
    ck.close()
  } // end of main

  def convert_data(line: String): String = {
    val HDR = line.slice(0, 4)
    val ACC = line.slice(4, 16)
    HDR + ACC // rest of the conversion elided
  }
} // end of object
However, if I make the following changes, it throws a "Task not serializable" error:
. . . . .
. . . . .
    val mstr_rdd = spark.sparkContext.textFile(infile)
    val mstr_recs = mstr_rdd.map(line => convert_data(line, ck)) // <== Changed
    mstr_recs.saveAsTextFile(outfile)
    ck.close()
  } // end of main

  def convert_data(line: String, ck: java.io.BufferedOutputStream): String = { // <== Changed
    val HDR = line.slice(0, 4)
    val ACC = line.slice(4, 16)
    ck.write((ACC + " mapped").getBytes) // <== New
    HDR + ACC // rest of the conversion elided
  }
How can I resolve this?
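My understanding is that Spark must serialize the map closure to ship it to the executors, and the closure now captures ck, whose type java.io.BufferedOutputStream is not serializable (and even if it were, a stream opened on the driver could not be written to from an executor JVM). Below is a minimal sketch of the kind of restructuring I am considering, assuming the checkpoint entries can simply become a second Spark output (so rejfile would be a directory written by saveAsTextFile rather than a single file); convert_data2 and tagged are illustrative names, not part of my real code.

// Return the checkpoint entry alongside the converted record
// instead of writing it inside the map closure.
def convert_data2(line: String): (String, String) = {
  val HDR = line.slice(0, 4)
  val ACC = line.slice(4, 16)
  (HDR + ACC, ACC + " mapped") // (converted record, checkpoint entry)
}

val tagged = spark.sparkContext.textFile(infile).map(convert_data2)
tagged.cache() // avoid re-reading infile for the second save
tagged.map(_._1).saveAsTextFile(outfile) // converted records
tagged.map(_._2).saveAsTextFile(rejfile) // checkpoint entries, written by Spark itself

Is a restructuring like this the right direction, or is there a better way to keep per-record checkpoint output?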