Avro JSON additional fields

Date: 2018-01-24 22:25:45

Tags: avro spark-avro

I have the following Avro schema:

{
    "type":"record",
    "name":"test",
    "namespace":"test.name",
    "fields":[
        {"name":"items","type":
            {"type":"array",
                "items":
                    {"type":"record","name":"items",
                        "fields":[
                                {"name":"name","type":"string"},
                                {"name":"state","type":"string"}
                            ]
                    }
            }
        },
        {"name":"firstname","type":"string"}
    ]
}

I encode JSON data using the JSON decoder and the Avro binary encoder:

import java.io.ByteArrayOutputStream

import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory, JsonDecoder}

// schema: the Avro Schema parsed from the schema string above; json: the input JSON string
val writer = new GenericDatumWriter[GenericRecord](schema)
val reader = new GenericDatumReader[GenericRecord](schema)
val baos = new ByteArrayOutputStream
val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(schema, json)
val encoder = EncoderFactory.get.binaryEncoder(baos, null)
val datum = reader.read(null, decoder)
writer.write(datum, encoder)
encoder.flush()
val avroByteArray = baos.toByteArray

Scenario 1: When I pass the following JSON to encode, it works fine:

{
  "items": [
    {
      "name": "dallas",
      "state": "TX"
    }
  ],
  "firstname":"arun"
}

Scenario 2: When I pass an additional attribute in the JSON at the root level (lastname), it encodes and works fine:

{
  "items": [
    {
      "name": "dallas",
      "state": "TX"
    }
  ],
  "firstname":"fname",
  "lastname":"lname"
}

Scenario 3: When I add an additional attribute inside the array record (country), it throws the following exception:

Expected record-end. Got FIELD_NAME
org.apache.avro.AvroTypeException: Expected record-end. Got FIELD_NAME
    at org.apache.avro.io.JsonDecoder.error(JsonDecoder.java:698)
The JSON input for this scenario:

{
  "items": [
    {
      "name": "dallas",
      "state": "TX",
      "country":"USA"
    }
  ],
  "firstname":"fname",
  "lastname":"lname"
}

I need to get scenario #3 working; any help would be great.

2 Answers:

Answer 0 (score: 0)

Converting the JSON data with the Spark DataFrame approach, using the corresponding Avro schema, can help you:

  1. Create a Spark StructType from the Avro schema using SchemaConverters
  2. Create a DataFrame from the StructType and a JSON RDD[String]
  3. Convert the DataFrame rows back to JSON using df.toJSON
  4. Sample test case:

    import java.io.ByteArrayOutputStream
    
    import com.databricks.spark.avro.SchemaConverters
    import org.apache.avro.Schema
    import org.apache.avro.Schema.Parser
    import org.apache.avro.generic._
    import org.apache.avro.io._
    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.SparkSession.Builder
    import org.apache.spark.sql._
    import org.apache.spark.sql.types.StructType
    import org.scalatest.{Matchers, WordSpecLike}
    
    class Test extends WordSpecLike
      with Matchers {
    
      val schemaString: String =
        """{
          |    "type":"record",
          |    "name":"test",
          |    "namespace":"test.name",
          |    "fields":[
          |        {"name":"items","type":
          |            {"type":"array",
          |                "items":
          |                    {"type":"record","name":"items",
          |                        "fields":[
          |                                {"name":"name","type":"string"},
          |                                {"name":"state","type":"string"}
          |                            ]
          |                    }
          |            }
          |        },
          |        {"name":"firstname","type":"string"}
          |    ]
          |}""".stripMargin
    
      // create spark session and sql context
      val builder: Builder = SparkSession.builder.appName("testAvroSpark")
      val sparkSession: SparkSession = builder.master("local[1]").getOrCreate()
      val sc: SparkContext = sparkSession.sparkContext
      val sqlContext: SQLContext = sparkSession.sqlContext
    
      // avro schema from json type schema string
      val schema: Schema = new Parser().parse(schemaString)
    
      // get spark struct type from avro schema
      val requiredType: StructType =
        SchemaConverters.toSqlType(schema).dataType.asInstanceOf[StructType]
    
      "scenario one json data with given schema" in {
        val scenarioOneJson: String =
          """{
            |  "items": [
            |    {
            |      "name": "dallas",
            |      "state": "TX"
            |    }
            |  ],
            |  "firstname":"rumesh"
            |}""".stripMargin
    
        val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioOneJson))
    
        val outputJsonExpected: String =
          """{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
    
        val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
    
        assert(resultJson === outputJsonExpected)
        assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
      }
    
      "scenario two json data with given schema" in {
        val scenarioTwoJson: String =
          """{
            |  "items": [
            |    {
            |      "name": "dallas",
            |      "state": "TX"
            |    }
            |  ],
            |  "firstname":"rumesh",
            |  "lastname":"krish"
            |}""".stripMargin
    
        val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioTwoJson))
    
        val outputJsonExpected: String =
          """{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
    
        val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
    
        assert(resultJson === outputJsonExpected)
        assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
      }
    
      "scenario three json data with given schema" in {
        val scenarioThreeJson: String =
          """{
            |  "items": [
            |    {
            |      "name": "dallas",
            |      "state": "TX",
            |      "country":"USA"
            |    }
            |  ],
            |  "firstname":"rumesh",
            |  "lastname":"krish"
            |}""".stripMargin
    
        val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioThreeJson))
    
        val outputJsonExpected: String =
          """{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
    
        val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
    
        assert(resultJson === outputJsonExpected)
        assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
      }
    
      /**
        * convert the json using data frame json parser with given schema struct type
        *
        * @param customType   given data frame struct type
        * @param jsonInputRdd json rdd string
        * @return
        */
      private def customJsonConverter(customType: StructType,
                                      jsonInputRdd: RDD[String]): List[String] = {
        // create data frame from rdd string with struct type schema
        val df: DataFrame = sqlContext.read.schema(customType).json(jsonInputRdd)
    
        // get the list of json string data frame
        df.toJSON.rdd.toLocalIterator.toList
      }
    
    
      /**
        * avro binary serialization
        *
        * @param avroSchema avro schema
        * @param jsonData   json data
        * @return
        */
      private def binaryEncoder(avroSchema: Schema, jsonData: String): Array[Byte] = {
        val writer = new GenericDatumWriter[GenericRecord](avroSchema)
        val reader = new GenericDatumReader[GenericRecord](avroSchema)
        val baos = new ByteArrayOutputStream
        val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(avroSchema, jsonData)
        val encoder = EncoderFactory.get.binaryEncoder(baos, null)
        val datum = reader.read(null, decoder)
        writer.write(datum, encoder)
        encoder.flush()
        baos.toByteArray
      }
    
    }
    

Answer 1 (score: 0)

Your schema does not represent the structure in scenario 3: the "country" field is missing:

{"name":"country", "type":"string"}

You only declared the fields "name" and "state". The decoder therefore correctly expects the (sub-)record to end after those, and, as the error message says, it gets a(nother) field name ("country") instead.
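
A minimal sketch of the extended schema, assuming "country" should be a required string (if existing data without the field must stay readable, declare it as a union with null and a default instead):

{
    "type":"record",
    "name":"test",
    "namespace":"test.name",
    "fields":[
        {"name":"items","type":
            {"type":"array",
                "items":
                    {"type":"record","name":"items",
                        "fields":[
                                {"name":"name","type":"string"},
                                {"name":"state","type":"string"},
                                {"name":"country","type":"string"}
                            ]
                    }
            }
        },
        {"name":"firstname","type":"string"}
    ]
}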

BTW: you can use a generator to always derive a matching schema from your JSON; several are available on the web.
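
For completeness, a quick sketch of the check (extendedSchemaString is a hypothetical variable assumed to hold the extended schema above): the scenario-3 item with "country" should now decode with the same reader/writer code from the question.

import java.io.ByteArrayOutputStream

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

// parse the extended schema (with the "country" field) and decode the scenario-3 payload
val extendedSchema: Schema = new Schema.Parser().parse(extendedSchemaString)
val json =
  """{"items":[{"name":"dallas","state":"TX","country":"USA"}],"firstname":"fname"}"""

val reader = new GenericDatumReader[GenericRecord](extendedSchema)
val datum = reader.read(null, DecoderFactory.get.jsonDecoder(extendedSchema, json))
// previously threw: AvroTypeException: Expected record-end. Got FIELD_NAME

// re-encode the decoded record to Avro binary, as in the question
val baos = new ByteArrayOutputStream
val encoder = EncoderFactory.get.binaryEncoder(baos, null)
new GenericDatumWriter[GenericRecord](extendedSchema).write(datum, encoder)
encoder.flush()
val avroByteArray = baos.toByteArray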