我有一个数据框,并且正在进行转换,转换变得巨大且占用大量空间。我需要优化空间并优化DAG。
下面是原始数据框
from pyspark.sql.functions import rand,when
df = sqlContext.createDataFrame([
(1000, 11, 720, 34),
(1231, 35, 324, 19),
(2221, 4, 3481, 28),
(2334, 7, 3580, 324),
(4124, 19, 384, 1),
(5002, 12, 302, 21),
(2793, 921, 28, 2),
(8403, 102, 2, 57),
(7263, 20, 875, 675),
(6253, 452, 6, 76)
], ["id","val1_1","val2_1","val3_1"])
df = df.withColumn('val1_2', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val1_3', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val1_4', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val2_2', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val2_3', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val2_4', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val3_2', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val3_3', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val3_4', when(rand() > 0.5, 1).otherwise(0))
df.show(10)
+----+------+------+------+------+------+------+------+------+------+------+------+------+
| id|val1_1|val2_1|val3_1|val1_2|val1_3|val1_4|val2_2|val2_3|val2_4|val3_2|val3_3|val3_4|
+----+------+------+------+------+------+------+------+------+------+------+------+------+
|1000| 11| 720| 34| 0| 1| 1| 1| 1| 0| 1| 1| 0|
|1231| 35| 324| 19| 1| 1| 1| 1| 1| 0| 0| 1| 1|
|2221| 4| 3481| 28| 1| 0| 0| 0| 1| 0| 1| 0| 1|
|2334| 7| 3580| 324| 1| 1| 0| 0| 0| 1| 0| 0| 0|
|4124| 19| 384| 1| 1| 0| 0| 0| 1| 1| 0| 1| 0|
|5002| 12| 302| 21| 0| 0| 0| 0| 0| 0| 0| 1| 1|
|2793| 921| 28| 2| 1| 0| 0| 0| 1| 1| 1| 1| 0|
|8403| 102| 2| 57| 0| 1| 1| 0| 1| 0| 0| 0| 0|
|7263| 20| 875| 675| 0| 1| 1| 0| 1| 1| 1| 0| 1|
|6253| 452| 6| 76| 1| 0| 1| 0| 1| 1| 0| 0| 1|
+----+------+------+------+------+------+------+------+------+------+------+------+------+
下面的代码占用大量空间,需要优化。每个x代表一个转换。但是当我执行一个动作时,它无法获取足够的内存
cols = ['val1_','val2_','val3_']
i = 0
# df_df = {}
for col in cols:
print(col)
columns = []
for val in range(1,5,1):
columns.append(col.lower()+str(val))
temp_df = df.select([column for column in columns])
n = lit(len(temp_df.columns)-1.0)
rowMean = (reduce(add,(column(x) for x in temp_df.columns[0:])) / n).alias(col[:-1]+"mean")
rowMax = (reduce(greatest,(column(x) for x in temp_df.columns[0:]))).alias(col[:-1]+"max")
rowSum = (reduce(add,(column(x) for x in temp_df.columns[0:]))).alias(col[:-1]+"Sum")
rowSum_0_12 = (reduce(add,(column(x) for x in temp_df.columns[0:2]))).alias(col[:-1]+"Sum_0_12")
rowSum_12_24 = (reduce(add,(column(x) for x in temp_df.columns[2:4]))).alias(col[:-1]+"Sum_12_24")
# Calculating Row wise mean
x1 = temp_df.select(rowMean).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
print("x1 completed")
# data_df = data.withColumn
# Calculating Row wise Max value
x2 = temp_df.select(rowMax).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
print("x2 completed")
df_df = x1.join(x2,x1.colIndex==x2.colIndex,'inner').drop(x2.colIndex)
# print(data_df.show(1))
# Calculating Row wise Sum value
x3 = temp_df.select(rowSum).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
print("x3 completed")
df_df = df_df.join(x3,df_df.colIndex==x3.colIndex,'inner').drop(x3.colIndex)
# print(data_df.show(1))
# Calculating Row wise sum value between 0 and 1 column
x4 = temp_df.select(rowSum_0_12).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
print("x4 completed")
df_df = df_df.join(x4,df_df.colIndex==x4.colIndex,'inner').drop(x4.colIndex)
# print(data_df.show(1))
# Calculating Row wise sum value between 2 and 3 column
x5 = temp_df.select(rowSum_12_24).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
print("x5 completed")
df_df = df_df.join(x5,df_df.colIndex==x5.colIndex,'inner').drop(x5.colIndex)
# print(data_df.show(1))
# x4 = x4.withColumn('colIndex',row_number().row_number().over(Window.orderBy(monotonically_increasing_id())))
# x5 = x5.withColumn('colIndex',row_number().row_number().over(Window.orderBy(monotonically_increasing_id())))
# Calculating Row wise sum value between x4-x5/x5
x6_temp = x4.join(x5,'colIndex').withColumn(col[:-1]+'sub_0_12',x4[col[:-1]+'Sum_0_12']-x5[col[:-1]+'Sum_12_24']).select(['colIndex',col[:-1]+'sub_0_12'])
x6 = x6_temp.join(x5,'colIndex').withColumn(col[:-1]+'Avg_0_12',x6_temp[col[:-1]+'sub_0_12']/x5[col[:-1]+'Sum_12_24']).select(['colIndex',col[:-1]+'Avg_0_12'])
print("x6 completed")
df_df = df_df.join(x6,df_df.colIndex==x6.colIndex,'inner').drop(x6.colIndex)
# print(data_df.show(1))
rowSum_0_6 = (reduce(add,(column(x) for x in [temp_df.columns[0],temp_df.columns[2]]))).alias(col[:-1]+"Sum_0_6")
rowSum_6_12 = (reduce(add,(column(x) for x in [temp_df.columns[1],temp_df.columns[3]]))).alias(col[:-1]+"Sum_6_12")
# Calculating Row wise sum value 0 and 2
x7 = temp_df.select(rowSum_0_6).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
print("x7 completed")
df_df = df_df.join(x7,df_df.colIndex==x7.colIndex,'inner').drop(x7.colIndex)
# print(data_df.show(1))
# Calculating Row wise sum value 1 and 3
x8 = temp_df.select(rowSum_6_12).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
print("x8 completed")
df_df = df_df.join(x8,df_df.colIndex==x8.colIndex,'inner').drop(x8.colIndex)
if i == 0:
data_final = df_df
else:
data_final = data_final.join(df_df,data_final.colIndex==df_df.colIndex,'inner').drop(df_df.colIndex)
i+= 1
data_final.show(2)