I am reading data from a CSV file containing 700,000 rows into a Spark DataFrame using PySpark (Python) with the code below. When I display the records, I only see 2,759 rows. Does a DataFrame have a fixed size, or am I missing something here?
from pyspark.sql import SparkSession, Row

# create spark session
spark = SparkSession.builder.appName("DailyItemSales").getOrCreate()
# get the raw data (RDD)
lines = spark.read \
    .format("csv") \
    .option("header", "true") \
    .load(filename) \
    .rdd
def parse_input(line):
    # map one raw CSV row to a Row with the columns of interest
    return Row(
        dish_name=line[1],
        section=line[3],
        family=line[4],
        subfamily=line[5],
        sale_date=line[6],
        quantity=get_value(line[7]),
        amount=get_value(line[8])
    )

def get_value(value):
    # treat missing values as 0.0, otherwise take the first number in the field
    if value is None:
        return 0.0
    val = [float(x) for x in value.strip().split()]
    return next(iter(val))
# convert it to an RDD of Row objects
sales = lines.map(parse_input)
# create the dataframe
df = spark.createDataFrame(sales)
# display records (only 2759 records show up)
for i, x in enumerate(df.collect(), start=1):
    print(i, x.dish_name, x.amount, x.family, x.quantity)
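
For reference, here is a minimal diagnostic sketch (not part of the original code) to check whether rows are lost during the CSV read itself or only after the map/createDataFrame step. It assumes the same `spark` session and `filename` as above; the multiLine/quote options are an assumption to test, since fields containing embedded newlines or quotes can make the CSV parser collapse many physical lines into far fewer logical records.

# Diagnostic sketch: count rows at each stage to see where records disappear.
raw_df = spark.read.format("csv").option("header", "true").load(filename)
print("rows parsed by spark.read:", raw_df.count())
print("rows after map/createDataFrame:", df.count())

# Hypothetical variant: if fields contain embedded newlines or quotes,
# these options change how many logical records the CSV parser produces.
quoted_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("multiLine", "true") \
    .option("quote", '"') \
    .option("escape", '"') \
    .load(filename)
print("rows with multiLine/quote handling:", quoted_df.count())

Comparing these counts against the 700,000 lines in the file should show whether the discrepancy comes from CSV parsing or from the later transformation.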