I'm trying to use this function to export a Spark DataFrame to CSV:
def save_to_csv(df, filepath, append_header=False):
    columns = [c for c in df.columns]
    len_columns = len(df.columns)
    # Create single column of comma-separated fields (no brackets - data only), replacing null values with ''
    rdd_data = df.rdd.map(lambda row: ','.join([str(row[i]) if row[i] != None else '' for i in range(len_columns)]))
    rdd_all = None
    if append_header:
        # Create header column of comma-separated field names
        header = ','.join([c for c in columns])
        # sc is the existing SparkContext
        rdd_header = sc.parallelize([header])
        rdd_header = rdd_header.zipWithIndex()
        # Shift the data indices by 1 so the header (index 0) sorts first
        rdd_data = rdd_data.zipWithIndex().map(lambda row: (row[0], 1 + row[1]))
        rdd_all = rdd_header.union(rdd_data)
        rdd_all = rdd_all.sortBy(lambda row: row[1])
        rdd_all = rdd_all.map(lambda row: row[0])
    else:
        rdd_all = rdd_data
    rdd_all.saveAsTextFile(filepath, 'org.apache.hadoop.io.compress.GzipCodec')
This used to work fine, but it is currently failing with a "Unicode error". I believe I need to convert the encoding to 'utf-8', but I'm not sure where in the function to apply it.
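One spot I've been considering is the row-formatting map, since that's where str() touches the field values. A rough sketch of what I mean (assuming Python 2, where unicode() is available, and keeping the rest of the function unchanged; format_row is just a hypothetical helper name):

    # Hypothetical replacement for the rdd_data line inside save_to_csv:
    # build each field as a unicode string, then encode the joined line
    # to UTF-8 bytes before it is written out.
    def format_row(row):
        fields = [unicode(row[i]) if row[i] is not None else u'' for i in range(len_columns)]
        return u','.join(fields).encode('utf-8')

    rdd_data = df.rdd.map(format_row)

Is that the right place to do the conversion, or should it happen somewhere else in the function?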