我是Scala& amp;的新手火花。我正在尝试阅读文本文件并使用spark将其写入Avro。有人可以帮助我解决我的错误吗?
PS:我想使用spark core
由于
戒
class MyRegistrator extends KryoRegistrator {
override def registerClasses(kryo: Kryo) {
kryo.register(classOf[GenericData.Record])
}
}
object Write {
def main(args: Array[String]) {
val outPath = args(0)
val sconf = new SparkConf().setAppName("Spark Avro")
sconf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sconf.set("spark.kryo.registrator", "MyRegistrator")
val sc = new SparkContext(sconf)
val schema = new Parser().parse(this.getClass.getClassLoader
.getResourceAsStream("sample.avsc"))
val user = sc.textFile("hdfs://nameservice1/user/harsha/sample.csv");
val withValues = user.map((x) => (new AvroKey(x), NullWritable.get))
val writeJob = new Job()
AvroJob.setOutputKeySchema(writeJob, schema)
FileOutputFormat.setOutputPath(writeJob, new Path(outPath))
writeJob.setOutputFormatClass(classOf[AvroKeyOutputFormat[Any]])
withValues.saveAsNewAPIHadoopDataset(writeJob.getConfiguration)
}
}
但我得到的错误 -
15/11/16 15:59:10 INFO HadoopRDD: Input split: hdfs://nameservice1/user/harsha/sample.csv:0+84
15/11/16 15:59:10 INFO HadoopRDD: Input split: hdfs://nameservice1/user/harsha/sample.csv:84+84
15/11/16 15:59:10 INFO deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id
15/11/16 15:59:10 INFO deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id
15/11/16 15:59:10 INFO deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap
15/11/16 15:59:10 INFO deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition
15/11/16 15:59:10 INFO deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id
15/11/16 15:59:11 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
15/11/16 15:59:11 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
15/11/16 15:59:11 ERROR Executor: Exception in task 1.0 in stage 0.0 (TID 1)
org.apache.avro.file.DataFileWriter$AppendWriteException: java.lang.NullPointerException: in User null of User
at org.apache.avro.file.DataFileWriter.append(DataFileWriter.java:296)
at org.apache.avro.mapreduce.AvroKeyRecordWriter.write(AvroKeyRecordWriter.java:77)
at org.apache.avro.mapreduce.AvroKeyRecordWriter.write(AvroKeyRecordWriter.java:39)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$12.apply(PairRDDFunctions.scala:1000)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$12.apply(PairRDDFunctions.scala:979)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.NullPointerException: in User null of User
at org.apache.avro.reflect.ReflectDatumWriter.write(ReflectDatumWriter.java:145)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:58)
at org.apache.avro.file.DataFileWriter.append(DataFileWriter.java:290)
... 10 more
这是我的架构文件
{
"type" : "record",
"name" : "User",
"fields" : [ {
"name" : "name",
"type" : "string"
}, {
"name" : "st_num",
"type" : "int"
}, {
"name" : "st_name",
"type" : "string"
}, {
"name" : "apt_num",
"type" : "string"
}, {
"name" : "price",
"type" : "string"
} ]
}