avro json附加字段

时间:2018-01-24 22:25:45

// Converts a JSON document into Avro binary form using a generic (schema-driven) record.
// NOTE(review): `schema` (Avro Schema) and `json` (JSON text) are expected to be in scope
// from the surrounding code; they are not defined in this snippet.
val writer = new GenericDatumWriter[GenericRecord](schema)
val reader = new GenericDatumReader[GenericRecord](schema)
val baos = new ByteArrayOutputStream
val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(schema, json)
val encoder = EncoderFactory.get.binaryEncoder(baos, null)
// Parse the JSON text into a GenericRecord (no record instance is reused, hence `null`).
val datum = reader.read(null, decoder)
writer.write(datum, encoder)
// binaryEncoder returns a buffered encoder: without flush() the bytes may still sit in
// the encoder's internal buffer and `baos` would be empty or truncated.
encoder.flush()
val avroByteArray = baos.toByteArray

场景1: 当我传入以下json进行编码时,它工作正常:

  "items": [
      "name": "dallas",
      "state": "TX"

场景2: 当我在json的根级别传入附加属性(lastname)时,它仍然能够编码并正常工作:

  "items": [
      "name": "dallas",
      "state": "TX"

场景3: 当我在数组记录中添加附加属性(country)时,它会抛出以下异常:

Expected record-end. Got FIELD_NAME
org.apache.avro.AvroTypeException: Expected record-end. Got FIELD_NAME
    at org.apache.avro.io.JsonDecoder.error(JsonDecoder.java:698)
  "items": [
      "name": "dallas",
      "state": "TX",


  1. 使用 SchemaConverters 从avro架构创建结构类型(struct type)
  2. 准备struct type和json字符串rdd
  3. 用二者创建数据框(sqlContext.read.schema(...).json(...))
  4. 使用 df.toJSON 将数据框的行转换为json
  5. 示例测试用例:

    import java.io.ByteArrayOutputStream
    import com.databricks.spark.avro.SchemaConverters
    import org.apache.avro.Schema
    import org.apache.avro.Schema.Parser
    import org.apache.avro.generic._
    import org.apache.avro.io._
    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.SparkSession.Builder
    import org.apache.spark.sql._
    import org.apache.spark.sql.types.StructType
    import org.scalatest.{Matchers, WordSpecLike}
    class Test extends WordSpecLike
      with Matchers {
      val schemaString: String =
          |    "type":"record",
          |    "name":"test",
          |    "namespace":"test.name",
          |    "fields":[
          |        {"name":"items","type":
          |            {"type":"array",
          |                "items":
          |                    {"type":"record","name":"items",
          |                        "fields":[
          |                                {"name":"name","type":"string"},
          |                                {"name":"state","type":"string"}
          |                            ]
          |                    }
          |            }
          |        },
          |        {"name":"firstname","type":"string"}
          |    ]
      // create spark session and sql context
      val builder: Builder = SparkSession.builder.appName("testAvroSpark")
      val sparkSession: SparkSession = builder.master("local[1]").getOrCreate()
      val sc: SparkContext = sparkSession.sparkContext
      val sqlContext: SQLContext = sparkSession.sqlContext
      // avro schema from json type schema string
      val schema: Schema = new Parser().parse(schemaString)
      // get spark struct type from avro schema
      val requiredType: StructType =
      "scenario one json data with given schema" in {
        val scenarioOneJson: String =
            |  "items": [
            |    {
            |      "name": "dallas",
            |      "state": "TX"
            |    }
            |  ],
            |  "firstname":"rumesh"
        val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioOneJson))
        val outputJsonExpected: String =
        val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
        assert(resultJson === outputJsonExpected)
        assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
      "scenario two json data with given schema" in {
        val scenarioTwoJson: String =
            |  "items": [
            |    {
            |      "name": "dallas",
            |      "state": "TX"
            |    }
            |  ],
            |  "firstname":"rumesh",
            |  "lastname":"krish"
        val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioTwoJson))
        val outputJsonExpected: String =
        val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
        assert(resultJson === outputJsonExpected)
        assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
      "scenario three json data with given schema" in {
        val scenarioThreeJson: String =
            |  "items": [
            |    {
            |      "name": "dallas",
            |      "state": "TX",
            |      "country":"USA"
            |    }
            |  ],
            |  "firstname":"rumesh",
            |  "lastname":"krish"
        val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioThreeJson))
        val outputJsonExpected: String =
        val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
        assert(resultJson === outputJsonExpected)
        assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
        * convert the json using data frame json parser with given schema struct type
        * @param customType   given data frame struct type
        * @param jsonInputRdd json rdd string
        * @return
      private def customJsonConverter(customType: StructType,
                                      jsonInputRdd: RDD[String]): List[String] = {
        // create data frame from rdd string with struct type schema
        val df: DataFrame = sqlContext.read.schema(customType).json(jsonInputRdd)
        // get the list of json string data frame
        * avro binary serialization
        * @param avroSchema avro schema
        * @param jsonData   json data
        * @return
      private def binaryEncoder(avroSchema: Schema, jsonData: String): Array[Byte] = {
        val writer = new GenericDatumWriter[GenericRecord](avroSchema)
        val reader = new GenericDatumReader[GenericRecord](avroSchema)
        val baos = new ByteArrayOutputStream
        val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(avroSchema, jsonData)
        val encoder = EncoderFactory.get.binaryEncoder(baos, null)
        val datum = reader.read(null, decoder)
        writer.write(datum, encoder)

答案 1 :(得分:0)


{"name":"country", "type":"string"}

