I am trying to read CSV files and combine them with other CSV files using sc.parallelize(), as shown below.
from functools import reduce
import glob
import pandas as pd
from pyspark.sql import DataFrame

def unionAll(list_):
    # Union an arbitrary list of DataFrames that share a schema
    return reduce(DataFrame.unionAll, list_)

def concat_by_filenames(file_input_path, file_name_pattern, cols):
    # Read every CSV matching the pattern, keep only `cols`, and union the results
    temp_list = []
    for i in glob.glob(str(file_input_path + '\\' + file_name_pattern) + '*.csv'):
        df = spark_session.read.csv(i, header=True, sep="|")
        df = df[cols]
        temp_list.append(df)
    return unionAll(temp_list)

def joins(parent_path, input_path, pd_chuncked):
    csv = ['imms.csv', 'conditions.csv', 'preps.csv']
    dict_ = {
        "conditions": ['PD_ID', 'A', 'B']}  # ('imms' and 'preps' entries trimmed from this snippet)
    for i in csv:
        file_ = concat_by_filenames(parent_path, i.split('.')[0], dict_[i.split('.')[0]])
        data = pd_chuncked.join(file_, on='PD_ID', how='inner')
        data = data.na.fill("")
        filename = i.split('.')[0] + ".csv"
        data.toPandas().to_csv(input_path + '/' + filename, sep='|', index=False, encoding='latin-1')

def chunks(iterator):
    # Runs on the executors via foreachPartition
    df = pd.DataFrame(list(iterator), index=None, columns=['PD_ID'])
    joins(parent_path, op_path, df)

pd_id_df = concat_by_filenames(parent_path, "pd_id_*", ['PD_ID'])
pd_id_df1 = pd_id_df.toPandas()
pd_ids = pd_id_df1["PD_ID"].to_list()
sc.parallelize(pd_ids, num_partitions).foreachPartition(chunks)
But I get the following error:

PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
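From reading about SPARK-5063, my understanding is that foreachPartition ships chunks() to the executors, and joins() then calls spark_session.read.csv there, even though a SparkSession/SparkContext is only usable on the driver. Below is a driver-only rewrite I am considering instead; this is just an untested sketch (process_in_chunks and chunk_size are names I made up), and I convert each chunk to a Spark DataFrame because joins() uses Spark's .join and .na.fill:

def process_in_chunks(ids, chunk_size):
    # Loop over ID chunks on the driver, so spark_session is never
    # referenced from code running on the workers
    for start in range(0, len(ids), chunk_size):
        chunk = pd.DataFrame(ids[start:start + chunk_size], columns=['PD_ID'])
        # Hypothetical: hand joins() a Spark DataFrame built on the driver
        joins(parent_path, op_path, spark_session.createDataFrame(chunk))

process_in_chunks(pd_ids, chunk_size=50000)

The tradeoff I see is that the chunks are processed sequentially on the driver rather than in parallel across partitions, although the Spark joins inside joins() would still run distributed.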
Can anyone help me with this?