I am running into this error when I run a Spark job in standalone cluster mode. I have the following PySpark code:
import re

from pyspark.sql.functions import lit, udf
from pyspark.sql.types import StringType

def join_client_info(self, cur_df):
    # Load the client-role and salary-history Parquet tables.
    raw_clrr_df = sqlContext.read.parquet(hdfs_path + '/data/life400/CLRRPF') \
        .selectExpr(['CLNTNUM as cliNum', 'CLRRROLE'])
    salh_df = sqlContext.read.parquet(hdfs_path + '/data/life400/SALHPF') \
        .selectExpr(['CLNTNUM as cliNum', 'DECL_GR_SALARY as proposerSalary'])

    # Strip all non-alphanumeric characters from the role code.
    spaceDeleteUDF = udf(lambda s: re.sub('[^A-Za-z0-9]+', '', s), StringType())
    clrr_df = raw_clrr_df.withColumn('clientRole', spaceDeleteUDF(raw_clrr_df['CLRRROLE'])) \
        .drop('CLRRROLE')

    # Pull this client's number into the driver.
    cli_num = cur_df.select(['cliNum']).collect()[0]['cliNum']

    # Count the client's rows with role 'LF' and with role 'OW'.
    number_of_pols_lf = clrr_df.filter('cliNum=' + cli_num) \
        .where(clrr_df['clientRole'] == 'LF') \
        .count()
    number_of_pols_ow = clrr_df.filter('cliNum=' + cli_num) \
        .where(clrr_df['clientRole'] == 'OW') \
        .count()

    # Attach both counts as literal columns.
    with_lf_num_of_policies = cur_df.withColumn('numberOfPolsIn',
                                                lit(number_of_pols_lf))
    with_lf_ow_num_of_policies = with_lf_num_of_policies.withColumn('numberOfPolsOw',
                                                                    lit(number_of_pols_ow))
    # print(cur_df)
    with_proposer_sal = salh_df.filter('cliNum=' + cli_num)
    return with_lf_ow_num_of_policies.join(with_proposer_sal, 'cliNum', 'inner')
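For reference, the snippet assumes sqlContext and hdfs_path are already defined in the driver. A minimal sketch of that setup (the app name and HDFS address below are placeholders, not my real values):

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName='join-client-info')  # job is submitted in standalone cluster mode
sqlContext = SQLContext(sc)
hdfs_path = 'hdfs://namenode:8020'             # placeholder; my real NameNode address differs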
If I uncomment the "print(cur_df)" line, it works fine and does not give me an error. I find this behavior strange. What am I missing here?