Question

当我从 GenericRecord（Avro）创建 RDD，随后立即 collect 并打印这些记录时，得到的字段值是错误的——它们以一种奇怪的方式被修改了：记录中各字段的值都变成了模式（schema）中第一个字段的值，例如：

/** Builds a record for the inline `test_schema` schema.
  *
  * @param first  value stored under `test_field1`
  * @param second value stored under `test_field2`
  * @return a new `Record` carrying both values
  */
def createGenericRecord(first: String, second: String) = {
  // Record schema with two fields: a string and a nullable string.
  val schemaJson =
      """
        |{
        | "type": "record",
        | "name": "test_schema",
        | "fields":[
          | { "name": "test_field1", "type": "string" },
          | { "name": "test_field2", "type": ["null", "string"] }
        |]
        |}
      """.stripMargin

  // Parser with both validation switches turned on before parsing.
  val schemaParser = new Schema.Parser()
  schemaParser.setValidate(true)
  schemaParser.setValidateDefaults(true)

  // Populate a fresh record backed by the parsed schema and return it.
  val populated = new Record(schemaParser.parse(schemaJson))
  populated.put("test_field1", first)
  populated.put("test_field2", second)
  populated
}

// Build two records locally; printed directly, each field shows the value that was put in.
val record1 = createGenericRecord("test1","test2")
val record2 = createGenericRecord("test3","test4")

println(record1)// prints {"test_field1": "test1", "test_field2": "test2"}
println(record2)// prints {"test_field1": "test3", "test_field2": "test4"}

// Pass the same two records through an RDD and collect them back.
val t = sc.makeRDD(Seq(record1, record2))
val collected = t.collect()
// Reported problem: after collect(), test_field2 shows the value of test_field1.
println(collected(0))// prints {"test_field1": "test1", "test_field2": "test1"}
println(collected(1))// prints {"test_field1": "test3", "test_field2": "test3"}

我使用 Spark 1.2.0，并将 spark.serializer 配置为 org.apache.spark.serializer.KryoSerializer。

Answer 1

此问题的解决方案是将 Avro 依赖更新到 1.7.7 版本。

在 Spark 中从 GenericRecord 序列创建 RDD 会改变 GenericRecord 中的字段值

1 个答案: