I'm writing code that walks over a DataFrame multiple times, once per extracted timestamp. The problem is that memory is never released in Spark between runs of the outer loop: after collecting, the DataFrame holds only about 100 records, yet after four outer-loop iterations my memory usage is over 10 GB. I'm new to Spark. Here is my code:
import gc

import numpy as np
import pyspark.sql.functions as f
from pyspark.sql.functions import lit

# Car IDs that are infected at the start
infected = [3, 6, 127]

print("Grouping the values by timestamp")
all_value = {}
for level, value in enumerate(data_2020.select('timestamp').distinct().collect()):
    print(level, value)
    filter_condition = 'timestamp="{}"'.format(value.timestamp)
    sample_data = data_2020.filter(filter_condition).select('latitude', 'longitude', 'car_id')
    # Pull the infected rows to the driver; keep the remaining cars in Spark
    infected_df = sample_data.filter(sample_data.car_id.isin(*infected)).collect()
    sample_data = sample_data.filter(~sample_data.car_id.isin(*infected))
    sample_data = (sample_data
                   .withColumn("min_distance", lit(np.inf))
                   .withColumn("infector", lit(np.inf)))
    for base_df in infected_df:
        sample_data = (
            sample_data
            .withColumn('current_infector', lit(base_df.car_id))
            .withColumn('base_latitude', lit(base_df.latitude))
            .withColumn('base_longitude', lit(base_df.longitude))
            .withColumn('base_latitude', f.round('base_latitude', 6))
            .withColumn('base_longitude', f.round('base_longitude', 6))
            .withColumn('distance',
                        haversine_custom('latitude', 'longitude',
                                         'base_latitude', 'base_longitude'))
            .withColumn('calculation',
                        find_min_udf('min_distance', 'distance',
                                     'infector', 'current_infector'))
            .withColumn('threshold', threshold_udf('min_distance'))
            .select('latitude', 'longitude', 'car_id',
                    'calculation.min_distance', 'calculation.infector')
        )
    # Materialise only the rows that cross the infection threshold
    all_value[value.timestamp] = (sample_data
                                  .withColumn("threshold", threshold_udf("min_distance"))
                                  .filter("threshold=1")
                                  .collect())
    # Attempts to free memory between iterations
    sample_data.unpersist()
    sample_data = None
    del sample_data
    spark.catalog.clearCache()
    infected = list(set([i.car_id for i in all_value[value.timestamp]] + infected))
    print("Level: {} \n Infected: {}".format(level, infected))
    gc.collect()
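For reference, here is a stripped-down, self-contained sketch of the same looping pattern. The toy data and names are hypothetical, and haversine_custom and the UDFs are replaced by a plain column expression so it runs on its own. It shows how each withColumn inside the loop stacks another projection on top of the previous query plan, which is my best guess at where the memory is going:

# Minimal sketch only: hypothetical toy data; the custom UDFs are replaced
# by a plain column expression so the example is runnable on its own.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import abs as sql_abs, col, lit

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame(
    [(1, 10.0, 20.0), (2, 11.0, 21.0), (3, 12.0, 22.0)],
    ['car_id', 'latitude', 'longitude'])

sample_data = df.withColumn('min_distance', lit(np.inf))
for base in df.collect():  # stand-in for the collected infected rows
    sample_data = (sample_data
                   .withColumn('base_latitude', lit(base.latitude))
                   .withColumn('distance',
                               sql_abs(col('latitude') - lit(base.latitude))))
sample_data.explain()  # the printed plan grows with every loop iteration

With only two withColumn calls per row this stays small, but with seven calls per infector and hundreds of timestamps in my real code, the plans keep growing each pass, and I suspect that is related to the memory that never gets released.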