JSON:[{"entryLevel":{"a":"0","b":"1","c":"3","d":[],"e":[]}}]
实际输出:所有数据都挤在同一个结构体(struct)列中,而不是各自成列:
+------------------------------------------+
| entryLevel                               |
+------------------------------------------+
| [WrappedArray(),1,2,0,3,WrappedArray()]  |
+------------------------------------------+
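期望输出:每个元素各自位于单独的列中:
+------------------------------------------+---+---+---+----+----+
| entryLevel                               | a | b | c | d  | e  |
+------------------------------------------+---+---+---+----+----+
| [WrappedArray(),1,2,0,3,WrappedArray()]  | 0 | 1 | 3 | [] | [] |
+------------------------------------------+---+---+---+----+----+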
我已将 Jsonschema 定义为该 JSON 的模式(schema),代码如下:
/**
 * Schema of the nested {@code entryLevel} object.
 *
 * <p>Fields, in declaration order: {@code d} (array of strings), {@code a}, {@code b},
 * {@code c} (strings) and {@code e} (array of strings). Every field is nullable, array
 * elements may be null, and no field carries metadata.
 */
public static final StructType ObjectSchema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("d", DataTypes.createArrayType(DataTypes.StringType, true), true),
    DataTypes.createStructField("a", DataTypes.StringType, true),
    DataTypes.createStructField("b", DataTypes.StringType, true),
    DataTypes.createStructField("c", DataTypes.StringType, true),
    DataTypes.createStructField("e", DataTypes.createArrayType(DataTypes.StringType, true), true)
});
/**
 * Top-level schema of one JSON record: a single nullable {@code entryLevel} column whose
 * type is the struct described by {@code ObjectSchema}.
 */
public static final StructType Jsonschema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("entryLevel", ObjectSchema, true)
});
// Read the JSON input using the explicitly declared schema instead of letting Spark infer
// one, so that entryLevel always has the struct layout described by Jsonschema.
DataFrame parsedjson = sqlContext.read()
        .schema(Jsonschema)
        .option("multiLine", "true")
        .option("mode", "PERMISSIVE")
        .json(lines);

// entryLevel is a struct, not an array or map, so explode() does not apply to it.
// Nested struct fields are addressed with a dotted path; each selected field becomes its
// own top-level column named after the last path segment (a, b, c, d, e).
// The original entryLevel column is kept alongside them, matching the expected output.
DataFrame parsedjsoncol = parsedjson.select(
        "entryLevel",
        "entryLevel.a",
        "entryLevel.b",
        "entryLevel.c",
        "entryLevel.d",
        "entryLevel.e");