我尝试通过增加spark.driver.memory来增加内存,但是仍然出现内存不足错误。 我需要去垃圾收集吗?
我一次又一次调用这些函数来进行转换,但是不断得到:
PySpark:java.lang.OutOfMemoryError:Java堆空间
我已经检查了旧问题,但是这些问题没有帮助。
我没有日志,但是下面是导致问题的代码。
#4th task
def Etl_job4(new_df):
    """Rewrite every column value as ``"<column> : <value>"``.

    For each column ``n`` the cell becomes the label ``n + ' : '`` followed by
    the cell's value; when the value is NULL, ``concat`` yields NULL and
    ``coalesce`` falls back to the bare label (``n + ' : '`` with an empty
    suffix) instead.

    Bug fixed: the original assigned each transformed frame to a throwaway
    local ``newdf`` and then returned the untouched input ``new_df``, so the
    function was a no-op. ``new_df`` is now re-bound on every iteration and
    returned.

    :param new_df: input Spark DataFrame (all columns are labelled in place).
    :return: DataFrame with every column replaced by its labelled string form.
    """
    for col_name in new_df.columns:
        # coalesce: first branch is NULL when the source value is NULL,
        # in which case we emit just the "name : " label.
        new_df = new_df.withColumn(
            col_name,
            coalesce(
                concat(lit(col_name + ' : '), f.col(col_name)),
                concat(lit(col_name + ' : '), lit("")),
            ),
        )
    print("4 end")
    return new_df
#5th task
def Etl_job5(new_df):
    """Rewrite every column value as ``"<column> = <value>"``.

    Same transformation as ``Etl_job4`` but with ``' = '`` as the separator:
    each cell becomes ``n + ' = '`` followed by its value, or just the label
    when the value is NULL (``concat`` of a NULL is NULL, so ``coalesce``
    selects the empty-suffix branch).

    Bug fixed: the original assigned to a throwaway local ``newdf`` and
    returned the untouched input, making the function a no-op. ``new_df`` is
    now re-bound each iteration and returned.

    :param new_df: input Spark DataFrame.
    :return: DataFrame with every column replaced by its labelled string form.
    """
    for col_name in new_df.columns:
        # NULL-safe labelling: fall back to the bare "name = " label.
        new_df = new_df.withColumn(
            col_name,
            coalesce(
                concat(lit(col_name + ' = '), f.col(col_name)),
                concat(lit(col_name + ' = '), lit("")),
            ),
        )
    print("5 end")
    return new_df
#3rd task
def Etl_job3(new_df):
    """Unpivot vote columns into stacked (Type, Precinct, Votes) rows.

    Every column that is not one of the six key columns
    (Contest_Title, Party, Candidate, State, Year, County) is treated as a
    vote column:

    * ``ABSENTEE`` / ``PROVISIONAL`` columns -> Type = <column name>,
      Precinct = NULL;
    * any other column -> Type = 'PRECINCT', Precinct = <column name>.

    The column itself is renamed to ``Votes`` and all per-column frames are
    unioned into one DataFrame with the fixed output schema.

    Performance fix (likely related to the reported OutOfMemoryError): the
    original repeatedly did ``df3 = df3.union(rex_df)`` inside the loop,
    producing a left-deep logical plan whose depth grows with the number of
    columns — plan analysis/optimisation cost and driver memory grow badly
    with deep lineage. We instead collect the per-column frames and union
    them pairwise (balanced tree), which preserves row content and ordering
    but keeps the plan depth O(log n).

    :param new_df: wide input DataFrame containing the key columns plus one
        column per precinct/vote type.
    :return: long-format DataFrame matching ``schema`` below.
    """
    schema = StructType([
        StructField('Contest_Title', StringType(), True),
        StructField('Party', StringType(), True),
        StructField('Candidate', StringType(), True),
        StructField('State', StringType(), True),
        StructField('Year', IntegerType(), True),
        StructField('County', StringType(), True),
        StructField('Votes', IntegerType(), True),
        StructField('Type', StringType(), False),
        StructField('Precinct', StringType(), False),
    ])
    key_cols = ['Contest_Title', 'Party', 'Candidate', 'State', 'Year', 'County']

    # Build one narrow frame per vote column.
    frames = [sqlContext.createDataFrame(sc.emptyRDD(), schema)]
    for n in [x for x in new_df.columns if x not in key_cols]:
        rex_df = new_df.select(*key_cols, n)
        if n in ('ABSENTEE', 'PROVISIONAL'):
            rex_df = rex_df.withColumn("Type", lit(n)) \
                           .withColumn("Precinct", lit(None))
        else:
            rex_df = rex_df.withColumn("Type", lit('PRECINCT')) \
                           .withColumn("Precinct", lit(n))
        frames.append(rex_df.withColumnRenamed(n, 'Votes'))

    # Balanced pairwise union: same concatenation order as the original
    # sequential loop, but logarithmic plan depth instead of linear.
    while len(frames) > 1:
        frames = [
            frames[i].union(frames[i + 1]) if i + 1 < len(frames) else frames[i]
            for i in range(0, len(frames), 2)
        ]
    df3 = frames[0]
    print("3 end")
    return df3
def Etl_job2(inpath, outpath):
    """Split one pipe-delimited CSV by Contest_Title and write three variants.

    Reads ``inpath`` as a '|'-delimited CSV with header and inferred schema,
    finds the distinct ``Contest_Title`` values, and for each title:

    1. filters the frame to that title and unpivots it via ``Etl_job3``,
       writing the result under ``./O2/...``;
    2. labels values with "name : value" via ``Etl_job4`` -> ``./O21/...``;
    3. labels values with "name = value" via ``Etl_job5`` -> ``./O22/...``.

    Output directory names embed ``mid`` (derived from the underscore-split
    input file name) and ``outpath[26:]`` — presumably stripping a fixed
    26-character prefix from ``outpath``; TODO confirm against the caller.

    NOTE(review): each write goes through ``toPandas()``, which collects the
    entire DataFrame into driver memory — a likely contributor to the
    reported java.lang.OutOfMemoryError for large titles.
    """
    file_location = inpath
    f_name = os.path.basename(file_location)
    # File name pieces drive both the output sub-directory and file names.
    fname_list = f_name.split('_')
    print("###############", f_name)
    # mid = "<part0>_<part1-before-dash>_<part2-after-last-dash>"
    mid = fname_list[0] + '_' + fname_list[1].split('-')[0] + '_' + fname_list[2].split('-')[-1]
    df = sqlContext.read.format("csv").option("inferSchema", True).option("header", True).option("delimiter", "|").load(file_location)
    ## Get count of unique title and make a list
    temp = df.groupBy('Contest_Title').count()
    # collect() pulls the distinct titles to the driver (small: one row per title).
    c_title = [str(row['Contest_Title']) for row in temp.collect()]
    print("1 end")
    ## Generate unique file for each title
    for title in c_title:
        print("\ntitle==>", title)
        new_df = df.where(df['Contest_Title'] == title)
        # Splice the title into the second name segment after its 4th char,
        # then sanitise the joined name (dashes/commas/spaces -> underscores,
        # double-underscore collapse applied twice).
        fname_list[1] = fname_list[1][:4] + '_' + title + fname_list[1][4:]
        new_name = ('_'.join("_".join(x.split()) for x in fname_list)).strip()
        new_name = new_name.replace("-", "_").replace(",", "_").replace(" ", "_").replace("__", "_").replace("__", "_")
        new_df = Etl_job3(new_df)
        if not os.path.exists("./O2/" + outpath[26:] + mid + '/'):
            os.makedirs("./O2/" + outpath[26:] + mid + '/')
        #print("No of rows ==>",new_df.count(),"path==> /"+mid+'/'+new_name)
        # NOTE(review): toPandas() materialises the whole frame on the driver.
        new_df.toPandas().to_csv("./O2/" + outpath[26:] + mid + '/' + new_name, header=True, sep='|', index=False)
        print("3 end wright")
        new_df1 = Etl_job4(new_df)
        if not os.path.exists("./O21/" + outpath[26:] + mid + '/'):
            os.makedirs("./O21/" + outpath[26:] + mid + '/')
        #print("No of rows ==>",new_df.count(),"path==> /"+mid+'/'+new_name)
        new_df1.toPandas().to_csv("./O21/" + outpath[26:] + mid + '/' + new_name, header=True, sep='|', index=False)
        print("4 end wright")
        new_df1 = Etl_job5(new_df)
        if not os.path.exists("./O22/" + outpath[26:] + mid + '/'):
            os.makedirs("./O22/" + outpath[26:] + mid + '/')
        #print("No of rows ==>",new_df.count(),"path==> /"+mid+'/'+new_name)
        new_df1.toPandas().to_csv("./O22/" + outpath[26:] + mid + '/' + new_name, header=True, sep='|', index=False)
        print("5 end wright ")
        # Reset fname_list from the original file name so the next title's
        # splice starts from a clean segment list.
        fname_list = os.path.basename(file_location).split('_')
答案 0(得分:1):
创建系统变量:
_JAVA_OPTIONS=-Xmx4G -Xms3G
默认情况下,此变量可以设置为较小的值。如果您有8GB的内存,那么应该可以解决您的问题。