当我用 Avro 的 GenericRecord 创建 RDD,随后立即 collect 并打印这些记录时,得到的字段值是错误的,而且错得很奇怪:每条记录中所有字段的值都等于 schema 中第一个字段的值。例如:
/**
 * Builds an Avro record for the two-field `test_schema` and fills in both fields.
 *
 * @param first  value stored under `test_field1` (declared as `string`)
 * @param second value stored under `test_field2` (declared as the union `["null", "string"]`)
 * @return the populated record
 */
def createGenericRecord(first: String, second: String) = {
  // A fresh parser is created on every call, with both validation switches turned on.
  val strictParser = new Schema.Parser()
  strictParser.setValidate(true)
  strictParser.setValidateDefaults(true)

  // The closing margin line keeps the parsed text identical to a literal ending in a bare newline.
  val testSchema = strictParser.parse(
    """
      |{
      | "type": "record",
      | "name": "test_schema",
      | "fields":[
      | { "name": "test_field1", "type": "string" },
      | { "name": "test_field2", "type": ["null", "string"] }
      |]
      |}
      |""".stripMargin)

  val populated = new Record(testSchema)
  populated.put("test_field1", first)
  populated.put("test_field2", second)
  populated
}
// Build two records locally and print them before Spark is involved.
val record1 = createGenericRecord("test1", "test2")
val record2 = createGenericRecord("test3", "test4")
Seq(record1, record2).foreach(r => println(r))
// Output reported for the local records:
//   {"test_field1": "test1", "test_field2": "test2"}
//   {"test_field1": "test3", "test_field2": "test4"}

// Send the same two records through an RDD and print what collect() returns.
val t = sc.makeRDD(Seq(record1, record2))
val collected = t.collect()
Seq(0, 1).foreach(i => println(collected(i)))
// Output reported after collect() (test_field2 now repeats the value of test_field1):
//   {"test_field1": "test1", "test_field2": "test1"}
//   {"test_field1": "test3", "test_field2": "test3"}
我使用的是 Spark 1.2.0,并将 spark.serializer 配置为 org.apache.spark.serializer.KryoSerializer。
答案 0 :(得分:0)
此问题的解决方法是将 Avro 依赖升级到 1.7.7 版本。