我需要在 Databricks 中使用 Spark DataFrame 将 JSON 文件转换为 CSV 文件。我尝试了下面的代码,但写出时报错:CSV 数据源不支持数组(array)数据类型,因此无法生成 CSV 文件。有人能帮我解决这个问题,并说明如何去掉 `_corrupt_record` 列吗?
import json

# DBFS-mounted path of the input JSON file to convert.
data = r"/dbfs/FileStore/tables/ABC.json"
print("This is json data ", data)
def js_r(data):
    """Read the file at *data* as UTF-8 and return the parsed JSON object."""
    with open(data, encoding="utf-8") as handle:
        return json.load(handle)
if __name__ == "__main__":
    # Load the source JSON ({"Table": ..., "Data": [...]}) into a dict.
    dic_data_first = js_r(data)
    print("This is my dictionary", dic_data_first)
    keys = dic_data_first.keys()
    print("The original dict keys", keys)

    # BUG FIX: the original dict comprehension
    # {'my_items': dic_data_first['Data'] for key in keys}
    # rebuilt the same single entry once per key and the result was never
    # used. More importantly, dumping the whole dict back to disk made
    # Spark infer 'Data' as an array-typed column, and the CSV data
    # source rejects array columns. Writing each element of the 'Data'
    # array as its own JSON line (newline-delimited JSON) lets Spark
    # read one flat row per record, which the CSV writer accepts and
    # avoids any _corrupt_record column.
    with open('/dbfs/FileStore/tables/ABC_1.json', 'w') as f:
        for record in dic_data_first['Data']:
            f.write(json.dumps(record) + '\n')

    # Spark reads line-delimited JSON by default: one row per line.
    df = sqlContext.read.json('dbfs:/FileStore/tables/ABC_1.json')
    print(df)

    # .csv() already selects the CSV data source; the extra
    # .format("com.databricks.spark.csv") was redundant.
    df.write.mode("overwrite").option("header", "true").csv(
        "/dbfs/FileStore/tables/ABC_1.csv")
The JSON data is as follows:
{"Table":"test1",
"Data":[
{"aa":"1",
"bb":"2"},
{"aa" :"ss",
"bb":"dc"}
]
}