I am trying to display my results dataframe with PySpark and I get the error shown below.
from pyspark.sql.functions import col, count, countDistinct, when, trim, isnull, concat_ws, concat, lit, substring, round, avg, max, min, length, udf
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
# Get stats for all columns in a table
def get_table_stats(table, columns):
    udf_to_string = udf(lambda x: str(x), StringType())
    # Populate the source dataframe (sqlContext is assumed to already exist,
    # e.g. the one predefined by the PySpark shell)
    table_df = sqlContext.sql('SELECT ' + ','.join(columns) + ' FROM ' + table)
    # Column names for the results dataframe
    table_headers = ['_column',
                     '_max',
                     '_min',
                     '_avg',
                     '_max_length',
                     '_min_length',
                     '_avg_length']
    # Cycle through each column, obtain the main stats and put them into a dataframe
    for index, column in enumerate(table_df.columns):
        # Select the column values and the lengths of those values.
        # The values must first be converted to strings to obtain the lengths,
        # hence the to_string UDF.
        length_alias = column + '_lengths'
        col_df = table_df.select(column, length(udf_to_string(col(column))).alias(length_alias))
        # Compute the relevant aggregates, give them aliases and store them
        # as a single-row dataframe
        aggs_df = col_df.agg(max(column).alias(table_headers[1]),
                             min(column).alias(table_headers[2]),
                             avg(column).alias(table_headers[3]),
                             max(length_alias).alias(table_headers[4]),
                             min(length_alias).alias(table_headers[5]),
                             avg(length_alias).alias(table_headers[6]))
        # Add the column name as a column in our single-row results dataframe
        temp_raw_df = aggs_df.withColumn(table_headers[0], lit(column))
        # As we want to concatenate each row of column results into one full
        # dataframe of results, all values must be of the same type, so cast
        # every aggregate to a string
        temp_df = temp_raw_df.select([temp_raw_df['_column']] +
                                     [temp_raw_df[col_to_cast].cast(StringType())
                                      for col_to_cast in temp_raw_df.columns[:-1]])
        # Update master_df after each column's results are aggregated
        if index == 0:
            master_df = temp_df
        else:
            master_df = master_df.union(temp_df)
    return master_df
Defining this function and then running the following gives the error shown below.
>>> mydf = get_table_stats(table, ['index', 'name', 'age'])
>>> mydf.show()
UnicodeEncodeError: 'latin-1' codec can't encode characters in position 515-517: ordinal not in range(256)
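I suspect the udf_to_string helper may be involved: under Python 2, calling str() on a unicode value re-encodes it and fails on any non-ASCII character. A minimal sketch of that failure mode in plain Python 2, using a made-up value and no Spark at all:

>>> bad = u'Zo\xeb'  # hypothetical value containing a non-ASCII character
>>> str(bad)         # the same call the UDF makes for every row
Traceback (most recent call last):
  ...
UnicodeEncodeError: 'ascii' codec can't encode character u'\xeb' in position 2: ordinal not in range(128)

That said, the codec there is 'ascii' while my traceback says 'latin-1', so this may only be part of the story.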
Does anyone know whether this is a problem with Spark itself, or is it a Python issue?
I have already checked the encoding used in my IPython console, and it is 'UTF-8'.
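For reference, this is how I checked it (assuming sys.stdout.encoding is the setting that matters when show() prints):

>>> import sys
>>> sys.stdout.encoding
'UTF-8'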
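One workaround I am considering is dropping the Python UDF and using Spark's built-in cast instead, so the string conversion and length calculation happen on the JVM side. A sketch of the changed line, untested against my data:

col_df = table_df.select(column, length(col(column).cast(StringType())).alias(length_alias))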