我尝试通过增加spark.driver.memory来增加内存,但是仍然出现内存不足错误。 我需要去垃圾收集吗?
我一次又一次调用这些函数来进行转换,但是不断得到:
PySpark:java.lang.OutOfMemoryError:Java堆空间
我已经检查了旧问题,但是这些问题没有帮助。
我没有日志,但是下面是导致问题的代码。
#4th task
def Etl_job4(new_df):
    """Rewrite every column value as ``"<column> : <value>"``.

    For each column ``n`` the cell becomes the label ``n + ' : '`` followed by
    the cell's value; when the value is NULL, ``concat`` yields NULL and
    ``coalesce`` falls back to the bare label (``n + ' : '`` with an empty
    suffix) instead.

    Bug fixed: the original assigned each transformed frame to a throwaway
    local ``newdf`` and then returned the untouched input ``new_df``, so the
    function was a no-op. ``new_df`` is now re-bound on every iteration and
    returned.

    :param new_df: input Spark DataFrame (all columns are labelled in place).
    :return: DataFrame with every column replaced by its labelled string form.
    """
    for col_name in new_df.columns:
        # coalesce: first branch is NULL when the source value is NULL,
        # in which case we emit just the "name : " label.
        new_df = new_df.withColumn(
            col_name,
            coalesce(
                concat(lit(col_name + ' : '), f.col(col_name)),
                concat(lit(col_name + ' : '), lit("")),
            ),
        )
    print("4 end")
    return new_df
#5th task
def Etl_job5(new_df):
    """Rewrite every column value as ``"<column> = <value>"``.

    Same transformation as ``Etl_job4`` but with ``' = '`` as the separator:
    each cell becomes ``n + ' = '`` followed by its value, or just the label
    when the value is NULL (``concat`` of a NULL is NULL, so ``coalesce``
    selects the empty-suffix branch).

    Bug fixed: the original assigned to a throwaway local ``newdf`` and
    returned the untouched input, making the function a no-op. ``new_df`` is
    now re-bound each iteration and returned.

    :param new_df: input Spark DataFrame.
    :return: DataFrame with every column replaced by its labelled string form.
    """
    for col_name in new_df.columns:
        # NULL-safe labelling: fall back to the bare "name = " label.
        new_df = new_df.withColumn(
            col_name,
            coalesce(
                concat(lit(col_name + ' = '), f.col(col_name)),
                concat(lit(col_name + ' = '), lit("")),
            ),
        )
    print("5 end")
    return new_df
#3rd task
def Etl_job3(new_df):
    """Unpivot vote columns into stacked (Type, Precinct, Votes) rows.

    Every column that is not one of the six key columns
    (Contest_Title, Party, Candidate, State, Year, County) is treated as a
    vote column:

    * ``ABSENTEE`` / ``PROVISIONAL`` columns -> Type = <column name>,
      Precinct = NULL;
    * any other column -> Type = 'PRECINCT', Precinct = <column name>.

    The column itself is renamed to ``Votes`` and all per-column frames are
    unioned into one DataFrame with the fixed output schema.

    Performance fix (likely related to the reported OutOfMemoryError): the
    original repeatedly did ``df3 = df3.union(rex_df)`` inside the loop,
    producing a left-deep logical plan whose depth grows with the number of
    columns — plan analysis/optimisation cost and driver memory grow badly
    with deep lineage. We instead collect the per-column frames and union
    them pairwise (balanced tree), which preserves row content and ordering
    but keeps the plan depth O(log n).

    :param new_df: wide input DataFrame containing the key columns plus one
        column per precinct/vote type.
    :return: long-format DataFrame matching ``schema`` below.
    """
    schema = StructType([
        StructField('Contest_Title', StringType(), True),
        StructField('Party', StringType(), True),
        StructField('Candidate', StringType(), True),
        StructField('State', StringType(), True),
        StructField('Year', IntegerType(), True),
        StructField('County', StringType(), True),
        StructField('Votes', IntegerType(), True),
        StructField('Type', StringType(), False),
        StructField('Precinct', StringType(), False),
    ])
    key_cols = ['Contest_Title', 'Party', 'Candidate', 'State', 'Year', 'County']

    # Build one narrow frame per vote column.
    frames = [sqlContext.createDataFrame(sc.emptyRDD(), schema)]
    for n in [x for x in new_df.columns if x not in key_cols]:
        rex_df = new_df.select(*key_cols, n)
        if n in ('ABSENTEE', 'PROVISIONAL'):
            rex_df = rex_df.withColumn("Type", lit(n)) \
                           .withColumn("Precinct", lit(None))
        else:
            rex_df = rex_df.withColumn("Type", lit('PRECINCT')) \
                           .withColumn("Precinct", lit(n))
        frames.append(rex_df.withColumnRenamed(n, 'Votes'))

    # Balanced pairwise union: same concatenation order as the original
    # sequential loop, but logarithmic plan depth instead of linear.
    while len(frames) > 1:
        frames = [
            frames[i].union(frames[i + 1]) if i + 1 < len(frames) else frames[i]
            for i in range(0, len(frames), 2)
        ]
    df3 = frames[0]
    print("3 end")
    return df3
def Etl_job2(inpath, outpath):
    """Split one pipe-delimited CSV by Contest_Title and write three variants.

    Reads ``inpath`` as a '|'-delimited CSV with header and inferred schema,
    finds the distinct ``Contest_Title`` values, and for each title:

    1. filters the frame to that title and unpivots it via ``Etl_job3``,
       writing the result under ``./O2/...``;
    2. labels values with "name : value" via ``Etl_job4`` -> ``./O21/...``;
    3. labels values with "name = value" via ``Etl_job5`` -> ``./O22/...``.

    Output directory names embed ``mid`` (derived from the underscore-split
    input file name) and ``outpath[26:]`` — presumably stripping a fixed
    26-character prefix from ``outpath``; TODO confirm against the caller.

    NOTE(review): each write goes through ``toPandas()``, which collects the
    entire DataFrame into driver memory — a likely contributor to the
    reported java.lang.OutOfMemoryError for large titles.
    """
    file_location = inpath
    f_name = os.path.basename(file_location)
    # File name pieces drive both the output sub-directory and file names.
    fname_list = f_name.split('_')
    print("###############", f_name)
    # mid = "<part0>_<part1-before-dash>_<part2-after-last-dash>"
    mid = fname_list[0] + '_' + fname_list[1].split('-')[0] + '_' + fname_list[2].split('-')[-1]
    df = sqlContext.read.format("csv").option("inferSchema", True).option("header", True).option("delimiter", "|").load(file_location)
    ## Get count of unique title and make a list
    temp = df.groupBy('Contest_Title').count()
    # collect() pulls the distinct titles to the driver (small: one row per title).
    c_title = [str(row['Contest_Title']) for row in temp.collect()]
    print("1 end")
    ## Generate unique file for each title
    for title in c_title:
        print("\ntitle==>", title)
        new_df = df.where(df['Contest_Title'] == title)
        # Splice the title into the second name segment after its 4th char,
        # then sanitise the joined name (dashes/commas/spaces -> underscores,
        # double-underscore collapse applied twice).
        fname_list[1] = fname_list[1][:4] + '_' + title + fname_list[1][4:]
        new_name = ('_'.join("_".join(x.split()) for x in fname_list)).strip()
        new_name = new_name.replace("-", "_").replace(",", "_").replace(" ", "_").replace("__", "_").replace("__", "_")
        new_df = Etl_job3(new_df)
        if not os.path.exists("./O2/" + outpath[26:] + mid + '/'):
            os.makedirs("./O2/" + outpath[26:] + mid + '/')
        #print("No of rows ==>",new_df.count(),"path==> /"+mid+'/'+new_name)
        # NOTE(review): toPandas() materialises the whole frame on the driver.
        new_df.toPandas().to_csv("./O2/" + outpath[26:] + mid + '/' + new_name, header=True, sep='|', index=False)
        print("3 end wright")
        new_df1 = Etl_job4(new_df)
        if not os.path.exists("./O21/" + outpath[26:] + mid + '/'):
            os.makedirs("./O21/" + outpath[26:] + mid + '/')
        #print("No of rows ==>",new_df.count(),"path==> /"+mid+'/'+new_name)
        new_df1.toPandas().to_csv("./O21/" + outpath[26:] + mid + '/' + new_name, header=True, sep='|', index=False)
        print("4 end wright")
        new_df1 = Etl_job5(new_df)
        if not os.path.exists("./O22/" + outpath[26:] + mid + '/'):
            os.makedirs("./O22/" + outpath[26:] + mid + '/')
        #print("No of rows ==>",new_df.count(),"path==> /"+mid+'/'+new_name)
        new_df1.toPandas().to_csv("./O22/" + outpath[26:] + mid + '/' + new_name, header=True, sep='|', index=False)
        print("5 end wright ")
        # Reset fname_list from the original file name so the next title's
        # splice starts from a clean segment list.
        fname_list = os.path.basename(file_location).split('_')
答案 0(得分:1):
创建系统变量:
_JAVA_OPTIONS=-Xmx4G -Xms3G
默认情况下,此变量可以设置为较小的值。如果您有8GB的内存,那么应该可以解决您的问题。