I have an iterative algorithm in PySpark that updates a subset of a Spark DataFrame on each pass. I run it in a loop, and every iteration the job gets more expensive and the lineage gets longer: at iteration i the plan contains all the steps from iteration i-1 plus the new ones, so the lineage keeps growing.
I have tried several options to break the lineage, but none of them work. Here is my source code; I am running it from a JupyterLab VM.
import time
import traceback

import numpy as np
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, sum, udf, when  # note: shadows the builtin sum
from pyspark.sql.types import FloatType, IntegerType


def chronologically_compute(myDataFrame, number_of_compute, spark_session):
    # UDFs (find_law_to_apply and compute_loss are plain Python functions defined elsewhere)
    find_law_to_apply_udf = udf(find_law_to_apply, IntegerType())
    compute_loss_udf = udf(compute_loss, FloatType())

    TIMING = []
    #myDataFrame = myDataFrame.repartition(1000)
    spark_session.sparkContext.setCheckpointDir("myDirectory")
    #myDataFrame.explain(True)
    #myDataFrame.checkpoint()

    for i in range(1, number_of_compute + 1):
        debutRank = time.time()
        print("Iteration", i)

        # Update only the rows of rank i; every other row keeps its value.
        myDataFrame = myDataFrame.withColumn("column1",
            when(myDataFrame.rank == i, find_law_to_apply_udf("updatedComputed")
            ).otherwise(myDataFrame.column1))

        myDataFrame = myDataFrame.withColumn("SelectedValue",
            when(myDataFrame.rank == i, myDataFrame["column2"].getItem(col("column1") - 1)
            ).otherwise(myDataFrame.SelectedValue))

        myDataFrame = myDataFrame.withColumn("computed",
            when(myDataFrame.rank == i, compute_loss_udf("SelectedValue", "Time")
            ).otherwise(myDataFrame.computed))

        window = Window.partitionBy('ID')
        myDataFrame = myDataFrame.withColumn('computedSum', sum("computed").over(window))

        myDataFrame = myDataFrame.withColumn('updatedComputed',
            when(myDataFrame.rank == i, myDataFrame.computedSum + myDataFrame.updatedComputed
            ).otherwise(myDataFrame.updatedComputed))

        myDataFrame = myDataFrame.withColumn('updatedComputed',
            when(myDataFrame.rank == i + 1, myDataFrame.computedSum + myDataFrame.updatedComputed
            ).otherwise(myDataFrame.updatedComputed))

        if i % 10 == 0:
            # Attempt to break the lineage every 10 iterations.
            d = time.time()
            myDataFrame.checkpoint()
            print(myDataFrame.count())
            #myDataFrame.persist(StorageLevel.DISK_ONLY_2)
            duree_lineage = time.time() - d
            print("Lineage took {0}".format(duree_lineage))
            TIMING.append(duree_lineage)

        duree = time.time() - debutRank
        print("Modification took {0}".format(duree))

    print("Iteration time sum", np.sum(TIMING))
    print("Iteration time avg", np.mean(TIMING))
    return myDataFrame
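One thing I suspect but would like confirmed: as far as I can tell from the docs, DataFrame.checkpoint() is not an in-place operation, it returns a new DataFrame whose plan starts from the checkpointed data, so calling it without reassigning (as in my loop above) may never actually truncate the lineage. A minimal self-contained sketch of the reassignment I mean, with toy data and placeholder names:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.appName("LineageSketch").getOrCreate()
    spark.sparkContext.setCheckpointDir("myDirectory")

    df = spark.range(1000).withColumn("value", col("id") * 2.0)
    for i in range(1, 51):
        df = df.withColumn("value", col("value") + 1)
        if i % 10 == 0:
            # checkpoint(eager=True) materializes the partitions and returns a
            # NEW DataFrame rooted at the checkpoint files; the returned object
            # must replace the old reference or the long plan stays alive.
            df = df.checkpoint(eager=True)
    print(df.count())

With eager=True the data is written immediately, so the extra count() I was using to force materialization should no longer be needed. Is this the right way to use it?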
def main(spark_session):
    try:
        # spark_jobs is not shown here; it drives the computation above.
        spark_jobs(spark_session)
    except Exception:
        print(traceback.format_exc())
        raise
if __name__ == "__main__":
    SPARK_SESSION = SparkSession \
        .builder \
        .appName("AppName") \
        .enableHiveSupport() \
        .config('spark.executor.memory', '2g') \
        .config('spark.driver.memory', '2g') \
        .config('spark.driver.maxResultSize', '2g') \
        .config("spark.logLineage", "true") \
        .config("spark.executor.extraJavaOptions", "-Xss32M") \
        .getOrCreate()
    main(SPARK_SESSION)
    SPARK_SESSION.stop()
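For completeness, the other variants I am considering, continuing from the sketch above (both are my own assumptions, not something I have validated at scale):

    # Variant 1: localCheckpoint keeps the materialized partitions on the
    # executors instead of the checkpoint directory; cheaper, but the data
    # cannot be recomputed if an executor is lost.
    df = df.localCheckpoint(eager=True)

    # Variant 2 (workaround): round trip through parquet; reading the files
    # back yields a DataFrame with a fresh, single-scan plan.
    df.write.mode("overwrite").parquet("/tmp/lineage_break")
    df = spark.read.parquet("/tmp/lineage_break")

Would either of these be preferable to checkpoint() for a loop like mine?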