Using Spark 1.2.0
Hello,
I want to save data from a Kafka stream to Parquet. When a table is created with jsonRDD, a schema is applied to the JSON dataset, as described here: https://databricks.com/blog/2015/02/02/an-introduction-to-json-support-in-spark-sql.html
The data comes from Kafka and arrives as nested JSON.
Here is a basic example, reading from a text file, that illustrates how to specify a schema for non-nested JSON.
//contents of json
hdfs@2db12:~$ hadoop fs -cat User/names.json
{"name":"Michael", "age":10}
{"name":"Andy", "age":30}
{"name":"Justin"}
//create RDD from json
scala> val names= sc.textFile("hdfs://10.0.11.8:8020/user/hdfs/User/names.json")
scala> names.collect().foreach(println)
{"name":"Michael", "age":10}
{"name":"Andy", "age":30}
{"name":"Justin"}
// specify the schema programmatically
// (Spark 1.2: these types are available via org.apache.spark.sql._;
//  from Spark 1.3 on they live in org.apache.spark.sql.types._)
import org.apache.spark.sql._

val schemaString = "name age gender"
val schema = StructType(
  schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
val peopleSchemaRDD = sqlContext.jsonRDD(names, schema)
scala> peopleSchemaRDD.printSchema()
root
|-- name: string (nullable = true)
|-- age: string (nullable = true)
|-- gender: string (nullable = true)
scala> peopleSchemaRDD.registerTempTable("people")
scala> sqlContext.sql("SELECT name,age,gender FROM people").collect().foreach(println)
[Michael,10,null]
[Andy,30,null]
[Justin,null,null]
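(Side note toward the goal stated above: once the SchemaRDD exists, writing it out as Parquet is a single call in Spark 1.2; the output path below is hypothetical.)
// Spark 1.2: a SchemaRDD can be written directly to Parquet
peopleSchemaRDD.saveAsParquetFile("hdfs://10.0.11.8:8020/user/hdfs/User/people.parquet")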
Is it possible to specify a schema for nested JSON? For example, JSON like this: {"filename":"details","attributes":{"name":"Michael","age":10}}
Many thanks.
Answer 0 (score: 2)
If at least one of your JSON records actually contains the gender field, you can simply use sqlContext.jsonFile() and let Spark infer the schema.
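For illustration, a minimal sketch of the inference route, reusing the names.json path from the question:
// let Spark SQL infer the schema by scanning the records (Spark 1.2 API)
val inferred = sqlContext.jsonFile("hdfs://10.0.11.8:8020/user/hdfs/User/names.json")
inferred.printSchema()  // gender appears only if at least one record contains it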
Or define the schema explicitly:
val schema = StructType(
  StructField("filename", StringType, true) ::
  StructField(
    "attributes",
    StructType(schemaString.split(" ").map(fieldName =>
      StructField(fieldName, StringType, true)
    )),
    true
  ) :: Nil
)
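To connect this back to the question, a hedged sketch of applying the nested schema and querying into the struct; jsonLines stands in for an RDD[String] of records like the {"filename":...,"attributes":{...}} example:
val detailsSchemaRDD = sqlContext.jsonRDD(jsonLines, schema)
detailsSchemaRDD.registerTempTable("details")
// struct fields are reachable with dot notation
sqlContext.sql("SELECT filename, attributes.name, attributes.age FROM details").collect().foreach(println)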
Answer 1 (score: 2)
A Java version. The following link helped me:
create nested dataframe programmatically with Spark
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkContext;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public static void main(String[] args) throws AnalysisException {
    String master = "local[*]";

    // top-level (employee) fields
    List<StructField> employeeFields = new ArrayList<>();
    employeeFields.add(DataTypes.createStructField("firstName", DataTypes.StringType, true));
    employeeFields.add(DataTypes.createStructField("lastName", DataTypes.StringType, true));
    employeeFields.add(DataTypes.createStructField("email", DataTypes.StringType, true));

    // nested (address) fields
    List<StructField> addressFields = new ArrayList<>();
    addressFields.add(DataTypes.createStructField("city", DataTypes.StringType, true));
    addressFields.add(DataTypes.createStructField("state", DataTypes.StringType, true));
    addressFields.add(DataTypes.createStructField("zip", DataTypes.StringType, true));

    // "addresses" is modeled as an array of address structs
    ArrayType addressStruct = DataTypes.createArrayType(DataTypes.createStructType(addressFields));
    employeeFields.add(DataTypes.createStructField("addresses", addressStruct, true));
    StructType employeeSchema = DataTypes.createStructType(employeeFields);

    SparkSession sparkSession = SparkSession
            .builder().appName(SaveToCSV.class.getName())
            .master(master).getOrCreate();

    SparkContext context = sparkSession.sparkContext();
    context.setLogLevel("ERROR");

    SQLContext sqlCtx = sparkSession.sqlContext();

    // Employee is assumed to be a JavaBean whose properties match the schema above
    Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);

    // read the JSON with the explicit schema instead of inferring it
    Dataset<Employee> rowDataset = sparkSession.read()
            .option("inferSchema", "false")
            .schema(employeeSchema)
            .json("simple_employees.json").as(employeeEncoder);

    rowDataset.createOrReplaceTempView("employeeView");
    sqlCtx.sql("select * from employeeView").show();

    sparkSession.close();
}
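Note that Encoders.bean(Employee.class) relies on JavaBean reflection: it assumes an Employee bean (with a nested Address bean backing the addresses array) whose getters and setters line up with the schema fields above; neither class is shown in the answer.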