I get an error when converting my CSV into a DataFrame in PySpark.
import csv

read_rdd = sc.textFile("path to my container/myfile.csv")
intermediate_rdd = read_rdd.mapPartitions(lambda x: csv.reader(x, delimiter=","))
header = intermediate_rdd.first()
data_1 = intermediate_rdd.filter(lambda row: row != header).toDF(header)
data_1.show(5)
The show() call fails with:
UnicodeEncodeError: 'ascii' codec can't encode character u'\u2026' in position 115: ordinal not in range(128)
Answer 0 (score: 0)
import csv
from pyspark.sql import Row

read_rdd = sc.textFile("path/to/file")
# Parse each partition's lines with the csv module
intermediate_rdd = read_rdd.mapPartitions(lambda x: csv.reader(x, delimiter=","))
# Take the first row as the header, then drop it from the data before building the DataFrame
header = intermediate_rdd.first()
data = intermediate_rdd.filter(lambda row: row != header).toDF(header)
data.show(20)
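For reference, the UnicodeEncodeError in the question typically means Python 2 tried to encode a non-ASCII character (U+2026, the ellipsis) with the default 'ascii' codec while show() was printing rows. If you are on Spark 2.x or later, a common way to sidestep both the manual header filtering and the encoding issue is Spark's built-in CSV reader. The following is only a minimal sketch, not part of the original answer; the path and app name are placeholders, and it assumes a SparkSession is available (or can be created as shown):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csv-to-df").getOrCreate()

# Spark's CSV reader handles the header row and UTF-8 decoding itself,
# so there is no need to strip the header out manually.
df = spark.read.csv(
    "path/to/myfile.csv",   # placeholder path; replace with the real container path
    header=True,            # use the first line as column names
    inferSchema=True,       # infer column types instead of defaulting to strings
    encoding="UTF-8",       # decode the file as UTF-8
)
df.show(5)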