Avro JSON additional fields

Date: 2018-01-24 22:25:45

Tags: avro spark-avro

I have the following Avro schema:

{
    "type":"record",
    "name":"test",
    "namespace":"test.name",
    "fields":[
        {"name":"items","type":
            {"type":"array",
                "items":
                    {"type":"record","name":"items",
                        "fields":[
                                {"name":"name","type":"string"},
                                {"name":"state","type":"string"}
                            ]
                    }
            }
        },
        {"name":"firstname","type":"string"}
    ]
}

I encode JSON data using the JSON decoder and the Avro binary encoder:

import java.io.ByteArrayOutputStream

import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory, JsonDecoder}

// schema: the Avro Schema parsed from the schema string above; json: the input JSON string
val writer = new GenericDatumWriter[GenericRecord](schema)
val reader = new GenericDatumReader[GenericRecord](schema)
val baos = new ByteArrayOutputStream
val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(schema, json)
val encoder = EncoderFactory.get.binaryEncoder(baos, null)
val datum = reader.read(null, decoder)
writer.write(datum, encoder)
encoder.flush()
val avroByteArray = baos.toByteArray

Scenario 1: When I pass the following JSON to encode, it works fine:

{
  "items": [
    {
      "name": "dallas",
      "state": "TX"
    }
  ],
  "firstname":"arun"
}

Scenario 2: When I pass an additional attribute in the JSON at the root level (lastname), it encodes and works fine:

{
  "items": [
    {
      "name": "dallas",
      "state": "TX"
    }
  ],
  "firstname":"fname",
  "lastname":"lname"
}

Scenario 3: When I add an additional attribute inside the array record (country), it throws the following exception:

Expected record-end. Got FIELD_NAME
org.apache.avro.AvroTypeException: Expected record-end. Got FIELD_NAME
    at org.apache.avro.io.JsonDecoder.error(JsonDecoder.java:698)
The JSON input for this scenario:

{
  "items": [
    {
      "name": "dallas",
      "state": "TX",
      "country":"USA"
    }
  ],
  "firstname":"fname",
  "lastname":"lname"
}

I need to get scenario #3 working; any help would be great.

2 Answers:

Answer 0 (score: 0)

Converting the JSON data with the Spark DataFrame approach, using the corresponding Avro schema, can help you:

  1. Create a Spark StructType from the Avro schema using SchemaConverters
  2. Create a DataFrame from the StructType and a JSON RDD[String]
  3. Convert the DataFrame rows back to JSON using df.toJSON
  4. Sample test case:

    import java.io.ByteArrayOutputStream
    
    import com.databricks.spark.avro.SchemaConverters
    import org.apache.avro.Schema
    import org.apache.avro.Schema.Parser
    import org.apache.avro.generic._
    import org.apache.avro.io._
    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.SparkSession.Builder
    import org.apache.spark.sql._
    import org.apache.spark.sql.types.StructType
    import org.scalatest.{Matchers, WordSpecLike}
    
    class Test extends WordSpecLike
      with Matchers {
    
      val schemaString: String =
        """{
          |    "type":"record",
          |    "name":"test",
          |    "namespace":"test.name",
          |    "fields":[
          |        {"name":"items","type":
          |            {"type":"array",
          |                "items":
          |                    {"type":"record","name":"items",
          |                        "fields":[
          |                                {"name":"name","type":"string"},
          |                                {"name":"state","type":"string"}
          |                            ]
          |                    }
          |            }
          |        },
          |        {"name":"firstname","type":"string"}
          |    ]
          |}""".stripMargin
    
      // create spark session and sql context
      val builder: Builder = SparkSession.builder.appName("testAvroSpark")
      val sparkSession: SparkSession = builder.master("local[1]").getOrCreate()
      val sc: SparkContext = sparkSession.sparkContext
      val sqlContext: SQLContext = sparkSession.sqlContext
    
      // avro schema from json type schema string
      val schema: Schema = new Parser().parse(schemaString)
    
      // get spark struct type from avro schema
      val requiredType: StructType =
        SchemaConverters.toSqlType(schema).dataType.asInstanceOf[StructType]
    
      "scenario one json data with given schema" in {
        val scenarioOneJson: String =
          """{
            |  "items": [
            |    {
            |      "name": "dallas",
            |      "state": "TX"
            |    }
            |  ],
            |  "firstname":"rumesh"
            |}""".stripMargin
    
        val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioOneJson))
    
        val outputJsonExpected: String =
          """{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
    
        val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
    
        assert(resultJson === outputJsonExpected)
        assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
      }
    
      "scenario two json data with given schema" in {
        val scenarioTwoJson: String =
          """{
            |  "items": [
            |    {
            |      "name": "dallas",
            |      "state": "TX"
            |    }
            |  ],
            |  "firstname":"rumesh",
            |  "lastname":"krish"
            |}""".stripMargin
    
        val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioTwoJson))
    
        val outputJsonExpected: String =
          """{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
    
        val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
    
        assert(resultJson === outputJsonExpected)
        assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
      }
    
      "scenario three json data with given schema" in {
        val scenarioThreeJson: String =
          """{
            |  "items": [
            |    {
            |      "name": "dallas",
            |      "state": "TX",
            |      "country":"USA"
            |    }
            |  ],
            |  "firstname":"rumesh",
            |  "lastname":"krish"
            |}""".stripMargin
    
        val jsonRdd: RDD[String] = sc.parallelize(Seq(scenarioThreeJson))
    
        val outputJsonExpected: String =
          """{"items":[{"name":"dallas","state":"TX"}],"firstname":"rumesh"}"""
    
        val resultJson: String = customJsonConverter(requiredType, jsonRdd).head
    
        assert(resultJson === outputJsonExpected)
        assert(binaryEncoder(schema, outputJsonExpected) === binaryEncoder(schema, resultJson))
      }
    
      /**
        * convert the json using data frame json parser with given schema struct type
        *
        * @param customType   given data frame struct type
        * @param jsonInputRdd json rdd string
        * @return
        */
      private def customJsonConverter(customType: StructType,
                                      jsonInputRdd: RDD[String]): List[String] = {
        // create data frame from rdd string with struct type schema
        val df: DataFrame = sqlContext.read.schema(customType).json(jsonInputRdd)
    
        // get the list of json string data frame
        df.toJSON.rdd.toLocalIterator.toList
      }
    
    
      /**
        * avro binary serialization
        *
        * @param avroSchema avro schema
        * @param jsonData   json data
        * @return
        */
      private def binaryEncoder(avroSchema: Schema, jsonData: String): Array[Byte] = {
        val writer = new GenericDatumWriter[GenericRecord](avroSchema)
        val reader = new GenericDatumReader[GenericRecord](avroSchema)
        val baos = new ByteArrayOutputStream
        val decoder: JsonDecoder = DecoderFactory.get.jsonDecoder(avroSchema, jsonData)
        val encoder = EncoderFactory.get.binaryEncoder(baos, null)
        val datum = reader.read(null, decoder)
        writer.write(datum, encoder)
        encoder.flush()
        baos.toByteArray
      }
    
    }
    

Answer 1 (score: 0)

Your schema does not represent the structure in scenario 3: the "country" field is missing:

{"name":"country", "type":"string"}

You only declared the fields "name" and "state". The decoder therefore correctly expects the (sub-)record to end after those, and, as the error message says, it gets a(nother) field name ("country") instead.
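
A minimal sketch of the extended schema, assuming "country" should be a required string (if existing data without the field must stay readable, declare it as a union with null and a default instead):

{
    "type":"record",
    "name":"test",
    "namespace":"test.name",
    "fields":[
        {"name":"items","type":
            {"type":"array",
                "items":
                    {"type":"record","name":"items",
                        "fields":[
                                {"name":"name","type":"string"},
                                {"name":"state","type":"string"},
                                {"name":"country","type":"string"}
                            ]
                    }
            }
        },
        {"name":"firstname","type":"string"}
    ]
}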

BTW: you can use a generator to always derive a matching schema from your JSON; several are available on the web.
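
For completeness, a quick sketch of the check (extendedSchemaString is a hypothetical variable assumed to hold the extended schema above): the scenario-3 item with "country" should now decode with the same reader/writer code from the question.

import java.io.ByteArrayOutputStream

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

// parse the extended schema (with the "country" field) and decode the scenario-3 payload
val extendedSchema: Schema = new Schema.Parser().parse(extendedSchemaString)
val json =
  """{"items":[{"name":"dallas","state":"TX","country":"USA"}],"firstname":"fname"}"""

val reader = new GenericDatumReader[GenericRecord](extendedSchema)
val datum = reader.read(null, DecoderFactory.get.jsonDecoder(extendedSchema, json))
// previously threw: AvroTypeException: Expected record-end. Got FIELD_NAME

// re-encode the decoded record to Avro binary, as in the question
val baos = new ByteArrayOutputStream
val encoder = EncoderFactory.get.binaryEncoder(baos, null)
new GenericDatumWriter[GenericRecord](extendedSchema).write(datum, encoder)
encoder.flush()
val avroByteArray = baos.toByteArray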