StackOverflowError caused by a growing lineage from a loop (on a DataFrame)

Asked: 2018-11-13 14:36:47

Tags: python apache-spark pyspark apache-spark-sql

I have an iterative algorithm (PySpark) that updates part of a Spark DataFrame. I do this in a loop, and with every iteration the job gets more expensive and the lineage gets longer: at iteration i the plan contains all the steps from iteration i-1 plus the new ones, so the lineage keeps growing until the job fails with a StackOverflowError. I have tried several options to break the lineage, but none of them works. Here is my source code. I am running on a JupyterLab VM.

import time
import traceback

import numpy as np
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, sum, udf, when
from pyspark.sql.types import FloatType, IntegerType


def chronologically_compute(myDataFrame, number_of_compute, spark_session):
    # UDFs (find_law_to_apply and compute_loss are defined elsewhere)
    find_law_to_apply_udf = udf(find_law_to_apply, IntegerType())
    compute_loss_udf = udf(compute_loss, FloatType())

    TIMING = []

    #myDataFrame = myDataFrame.repartition(1000)
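    # Note: the checkpoint directory should point at a fault-tolerant
    # store (e.g. HDFS) when running on a cluster, not a local path.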
    spark_session.sparkContext.setCheckpointDir("myDirectory")
    #myDataFrame.explain(True)
    #myDataFrame.checkpoint()

    for i in range(1, number_of_compute + 1):

        debutRank = time.time()
        print("Itération", i)
        myDataFrame = myDataFrame.withColumn("column1", 
                                             when(myDataFrame.rank == i, find_law_to_apply_udf("updatedComputed")
                                                 ).otherwise(myDataFrame.column1))
        myDataFrame = myDataFrame.withColumn("SelectedValue", 
                                             when(myDataFrame.rank == i, myDataFrame["column2"].getItem(col("column1") - 1)
                                                 ).otherwise(myDataFrame.SelectedValue))
        myDataFrame = myDataFrame.withColumn("computed", 
                                             when(myDataFrame.rank == i, compute_loss_udf("SelectedValue", "Time")
                                                 ).otherwise(myDataFrame.computed))
        window = Window.partitionBy('ID')
        myDataFrame = myDataFrame.withColumn('computedSum', sum("computed").over(window))
        myDataFrame = myDataFrame.withColumn('updatedComputed', 
                                             when(myDataFrame.rank == i, myDataFrame.computedSum + myDataFrame.updatedComputed
                                                 ).otherwise(myDataFrame.updatedComputed))
        myDataFrame = myDataFrame.withColumn('updatedComputed', 
                                             when(myDataFrame.rank == i + 1, myDataFrame.computedSum + myDataFrame.updatedComputed
                                                 ).otherwise(myDataFrame.updatedComputed))   
        if i % 10 == 0:
            d = time.time()
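            # DataFrame.checkpoint() returns a new, checkpointed DataFrame;
            # the result is not reassigned below, so the plan held by
            # myDataFrame keeps growing despite this call.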
            myDataFrame.checkpoint()
            print(myDataFrame.count())
            #myDataFrame.persist(StorageLevel.DISK_ONLY_2) 
            duree_lineage = time.time() - d
            print("Lineage took {0}".format(duree_lineage))
            TIMING.append(duree_lineage)

        duree = time.time() - debutRank
        print("Modif took {0}".format(duree))

    print("Iteration time sum", np.sum(TIMING)) 
    print("Iteration time avg", np.mean(TIMING))

    return myDataFrame
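For reference, lineage truncation with checkpoint is normally written with the returned DataFrame reassigned, since DataFrame.checkpoint() does not modify the DataFrame in place. A minimal sketch of that pattern for the loop body above (localCheckpoint assumes Spark >= 2.3):

if i % 10 == 0:
    # checkpoint() returns a NEW DataFrame rooted at the checkpointed
    # files; reassigning the result is what truncates the lineage.
    myDataFrame = myDataFrame.checkpoint(eager=True)
    # Alternative: executor-local truncation, faster but not
    # fault-tolerant (requires Spark >= 2.3).
    # myDataFrame = myDataFrame.localCheckpoint(eager=True)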




def main(spark_session):
    try:
        # spark_jobs is not shown in this snippet
        spark_jobs(spark_session)
    except Exception:
        print(traceback.format_exc())
        raise


if __name__ == "__main__":

    SPARK_SESSION = SparkSession \
        .builder \
        .appName("AppName") \
        .enableHiveSupport() \
        .config('spark.executor.memory', '2g') \
        .config('spark.driver.memory', '2g') \
        .config('spark.driver.maxResultSize', '2g') \
        .config("spark.logLineage", "true") \
        .config("spark.executor.extraJavaOptions", "-Xss32M") \
        .getOrCreate()

    main(SPARK_SESSION)

    SPARK_SESSION.stop()
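
Another option sometimes used to cut a lineage completely is to round-trip the DataFrame through storage and reload it, so the new plan is rooted at the files on disk. A sketch, where lineage_break_path is a hypothetical writable location:

# Sketch: materialize to Parquet and reload to obtain a fresh plan.
# "lineage_break_path" is a hypothetical, writable path.
lineage_break_path = "myDirectory/lineage_break"
myDataFrame.write.mode("overwrite").parquet(lineage_break_path)
myDataFrame = SPARK_SESSION.read.parquet(lineage_break_path)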

0 Answers:

No answers