我有一个如下所示的 schema(架构)。我的目标是解析其中的嵌套对象,并将数据加载到 Hive 表中。运行环境为 Spark 2.2.0.2.6.4.0-91。下面附上了我目前已写好的初始代码,希望有人能帮我完成这部分编码。
root
 |-- CustData: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- TimeStamp: double (nullable = true)
 |    |    |-- Value_x: double (nullable = true)
 |    |    |-- Value_y: double (nullable = true)
 |    |    |-- Value_z: double (nullable = true)
 |-- Cust_ID: string (nullable = true)
 |-- Deprt_ID: string (nullable = true)
 |-- EndTime: double (nullable = true)
 |-- EndTimeZone: string (nullable = true)
 |-- Salesd: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Salesd_Value1: long (nullable = true)
 |    |    |-- Salesd_Value2: long (nullable = true)
 |    |    |-- Salesd_Value3: double (nullable = true)
 |    |    |-- Salesd_Value4: double (nullable = true)
 |-- Cust_RespData: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- TimeStamp: double (nullable = true)
 |    |    |-- Cust_RespData_val1: double (nullable = true)
 |    |    |-- Cust_RespData_val1: double (nullable = true)
 |    |    |-- Cust_RespData_val1: double (nullable = true)
 |    |    |-- Cust_RespData_val1: double (nullable = true)
 |-- Cust_RespData_ID: string (nullable = true)
#!/bin/python2
from pyspark import SparkContext
from pyspark.sql import SparkSession
##from pyspark.sql.functions import get_json_object
from pyspark.sql.functions import *
from pyspark.sql.types import StringType
import json
# define context
# Build the session with Hive support enabled: a plain SparkSession(sc) is
# not connected to the Hive metastore, so saveAsTable() below would not
# create a table visible to Hive.
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
sc = spark.sparkContext  # kept so the script still exposes `sc`

# load sources
# NOTE: host was misspelled "loaclhost" in the original URI.
jsonFile = "hdfs://localhost/data/cust_salesd.json"
sd = spark.read.json(jsonFile)
sd.printSchema()

# Top-level columns to carry alongside every exploded Salesd element.
# Selecting a column that is absent from the inferred schema raises an
# AnalysisException, so only the ones actually present are kept
# (e.g. "DriverID" does not appear in the schema printed above).
wantedCols = ["CustData", "Cust_ID", "Deprt_ID", "DriverID", "EndTime", "EndTimeZone"]
baseCols = [name for name in wantedCols if name in sd.columns]
missingCols = [name for name in wantedCols if name not in sd.columns]
if missingCols:
    print("Skipping columns absent from input: %s" % ", ".join(missingCols))

# explode() emits one output row per element of the Salesd array (rows whose
# array is null or empty produce no output).  The alias must be applied to
# the result of explode(), not to the column passed into it; otherwise the
# generated column is not named "Salesd_ROW" and the nested-field
# references below cannot be resolved.
sdf = sd.select(*(baseCols + [explode(col("Salesd")).alias("Salesd_ROW")]))
sdf.show()

# Flatten the exploded struct into ordinary top-level columns.
salesFields = ["Salesd_Value1", "Salesd_Value2", "Salesd_Value3", "Salesd_Value4"]
sdf1 = sdf.select(*(baseCols + ["Salesd_ROW." + field for field in salesFields]))
sdf1.show()

# Persist the flattened rows as a Hive table.  "append" creates the table on
# the first run and adds rows on later runs; switch to "overwrite" if each
# run should replace the previous contents.
# NOTE(review): the table name is a placeholder -- adjust database/table.
hiveTable = "cust_salesd"
sdf1.write.mode("append").saveAsTable(hiveTable)

spark.stop()