Spark - Specifying a schema for nested JSON

Time: 2015-06-25 11:47:21

Tags: apache-spark

Using Spark 1.2.0

Hello,

I want to save data from a Kafka stream to Parquet. When creating a table with jsonRDD, a schema is applied to the JSON dataset, as described here: https://databricks.com/blog/2015/02/02/an-introduction-to-json-support-in-spark-sql.html

The data comes from Kafka and arrives as nested JSON.
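Roughly, the intended flow looks like this. A minimal sketch, assuming a running StreamingContext named ssc and an SQLContext named sqlContext; the Zookeeper address, consumer group, topic, and output path are placeholders, and schema is defined as in the example below:

    import org.apache.spark.streaming.kafka.KafkaUtils

    // each Kafka message value is a JSON string; apply the schema and
    // write every micro-batch out as Parquet
    val stream = KafkaUtils.createStream(ssc, "zkhost:2181", "consumer-group", Map("topic" -> 1))
    stream.map(_._2).foreachRDD { rdd =>
      val schemaRDD = sqlContext.jsonRDD(rdd, schema)
      schemaRDD.saveAsParquetFile(s"hdfs://10.0.11.8:8020/user/hdfs/parquet/batch-${System.currentTimeMillis}")
    }
    ssc.start()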

Here is a basic example, reading from a text file, that illustrates how to specify a schema for non-nested JSON.

    //contents of json
    hdfs@2db12:~$ hadoop fs -cat User/names.json
    {"name":"Michael", "age":10}
    {"name":"Andy", "age":30}
    {"name":"Justin"}

    //create RDD from json
    scala> val names= sc.textFile("hdfs://10.0.11.8:8020/user/hdfs/User/names.json")
    scala> names.collect().foreach(println)
    {"name":"Michael", "age":10}
    {"name":"Andy", "age":30}
    {"name":"Justin"}

    // specify schema
    val schemaString = "name age gender"
    val schema = StructType(
      schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))

    val peopleSchemaRDD = sqlContext.jsonRDD(names, schema)

    scala> peopleSchemaRDD.printSchema()
    root
    |-- name: string (nullable = true)
    |-- age: string (nullable = true)
    |-- gender: string (nullable = true)

    scala> peopleSchemaRDD.registerTempTable("people")

    scala> sqlContext.sql("SELECT name,age,gender FROM people").collect().foreach(println)
    [Michael,10,null]
    [Andy,30,null]
    [Justin,null,null]

Is it possible to specify a schema for nested JSON? For example, for JSON like this: {"filename":"details", "attributes":{"name":"Michael", "age":10}}

Many thanks

2 answers:

Answer 0: (score: 2)

If at least one of your JSON records has a gender field, you can let sqlContext.jsonFile() infer the schema.
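A minimal sketch, assuming the same names.json path as in the question:

    // jsonFile scans the records and unions the fields it finds, so gender
    // only shows up in the inferred schema if some record carries it
    val people = sqlContext.jsonFile("hdfs://10.0.11.8:8020/user/hdfs/User/names.json")
    people.printSchema()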

Alternatively, define the schema explicitly:

    val schema = StructType(
      StructField("filename", StringType, true) ::
      StructField(
        "attributes",
        StructType(schemaString.split(" ").map(fieldName =>
          StructField(fieldName, StringType, true)
        )),
        true
      ) :: Nil
    )

Answer 1: (score: 2)

Java version. The following link helped me:

create nested dataframe programmatically with Spark

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.spark.SparkContext;
    import org.apache.spark.sql.AnalysisException;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoder;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SQLContext;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.ArrayType;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;

    public static void main(String[] args) throws AnalysisException {
        String master = "local[*]";

        // top-level fields of the employee record
        List<StructField> employeeFields = new ArrayList<>();
        employeeFields.add(DataTypes.createStructField("firstName", DataTypes.StringType, true));
        employeeFields.add(DataTypes.createStructField("lastName", DataTypes.StringType, true));
        employeeFields.add(DataTypes.createStructField("email", DataTypes.StringType, true));

        // nested address struct, wrapped in an array so each employee can
        // carry several addresses
        List<StructField> addressFields = new ArrayList<>();
        addressFields.add(DataTypes.createStructField("city", DataTypes.StringType, true));
        addressFields.add(DataTypes.createStructField("state", DataTypes.StringType, true));
        addressFields.add(DataTypes.createStructField("zip", DataTypes.StringType, true));
        ArrayType addressStruct = DataTypes.createArrayType(DataTypes.createStructType(addressFields));

        employeeFields.add(DataTypes.createStructField("addresses", addressStruct, true));
        StructType employeeSchema = DataTypes.createStructType(employeeFields);

        SparkSession sparkSession = SparkSession
                .builder().appName(SaveToCSV.class.getName())
                .master(master).getOrCreate();

        SparkContext context = sparkSession.sparkContext();
        context.setLogLevel("ERROR");

        SQLContext sqlCtx = sparkSession.sqlContext();

        Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);

        // read with the explicit schema instead of inferring it, then map
        // the rows onto the Employee bean
        Dataset<Employee> rowDataset = sparkSession.read()
                .option("inferSchema", "false")
                .schema(employeeSchema)
                .json("simple_employees.json").as(employeeEncoder);

        rowDataset.createOrReplaceTempView("employeeView");

        sqlCtx.sql("select * from employeeView").show();

        sparkSession.close();
    }
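For reference, a minimal Scala sketch of the same read; the Employee and Address case classes spell out the bean shape the Java example assumes, and simple_employees.json mirrors its placeholder path:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types._

    case class Address(city: String, state: String, zip: String)
    case class Employee(firstName: String, lastName: String, email: String,
                        addresses: Seq[Address])

    object NestedJsonExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("NestedJsonExample").master("local[*]").getOrCreate()
        import spark.implicits._

        // same layout as the Java employeeSchema: three string fields plus
        // an array of address structs
        val employeeSchema = StructType(Seq(
          StructField("firstName", StringType, true),
          StructField("lastName", StringType, true),
          StructField("email", StringType, true),
          StructField("addresses", ArrayType(StructType(Seq(
            StructField("city", StringType, true),
            StructField("state", StringType, true),
            StructField("zip", StringType, true)
          ))), true)
        ))

        val employees = spark.read.schema(employeeSchema)
          .json("simple_employees.json").as[Employee]
        employees.createOrReplaceTempView("employeeView")
        spark.sql("select * from employeeView").show()

        spark.stop()
      }
    }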