Question

我有一个数据框，并且正在进行转换，转换变得巨大且占用大量空间。我需要优化空间并优化DAG。

下面是原始数据框

from pyspark.sql.functions import rand,when
df = sqlContext.createDataFrame([
    (1000, 11, 720, 34),
    (1231, 35, 324, 19),
    (2221, 4, 3481, 28),
    (2334, 7, 3580, 324),
    (4124, 19, 384, 1),
    (5002, 12, 302, 21),
    (2793, 921, 28, 2),
    (8403, 102, 2, 57),
    (7263, 20, 875, 675),
    (6253, 452, 6, 76)
], ["id","val1_1","val2_1","val3_1"])

df = df.withColumn('val1_2', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val1_3', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val1_4', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val2_2', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val2_3', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val2_4', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val3_2', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val3_3', when(rand() > 0.5, 1).otherwise(0))
df = df.withColumn('val3_4', when(rand() > 0.5, 1).otherwise(0))


df.show(10)
+----+------+------+------+------+------+------+------+------+------+------+------+------+
|  id|val1_1|val2_1|val3_1|val1_2|val1_3|val1_4|val2_2|val2_3|val2_4|val3_2|val3_3|val3_4|
+----+------+------+------+------+------+------+------+------+------+------+------+------+
|1000|    11|   720|    34|     0|     1|     1|     1|     1|     0|     1|     1|     0|
|1231|    35|   324|    19|     1|     1|     1|     1|     1|     0|     0|     1|     1|
|2221|     4|  3481|    28|     1|     0|     0|     0|     1|     0|     1|     0|     1|
|2334|     7|  3580|   324|     1|     1|     0|     0|     0|     1|     0|     0|     0|
|4124|    19|   384|     1|     1|     0|     0|     0|     1|     1|     0|     1|     0|
|5002|    12|   302|    21|     0|     0|     0|     0|     0|     0|     0|     1|     1|
|2793|   921|    28|     2|     1|     0|     0|     0|     1|     1|     1|     1|     0|
|8403|   102|     2|    57|     0|     1|     1|     0|     1|     0|     0|     0|     0|
|7263|    20|   875|   675|     0|     1|     1|     0|     1|     1|     1|     0|     1|
|6253|   452|     6|    76|     1|     0|     1|     0|     1|     1|     0|     0|     1|
+----+------+------+------+------+------+------+------+------+------+------+------+------+

下面的代码占用大量空间，需要优化。每个x代表一个转换。但是当我执行一个动作时，它无法获取足够的内存

cols = ['val1_','val2_','val3_']
i = 0
#   df_df = {}
for col in cols:
    print(col)
    columns = []
    for val in range(1,5,1):
        columns.append(col.lower()+str(val))
    temp_df = df.select([column for column in columns])
    n = lit(len(temp_df.columns)-1.0)
    rowMean = (reduce(add,(column(x) for x in temp_df.columns[0:])) / n).alias(col[:-1]+"mean")
    rowMax = (reduce(greatest,(column(x) for x in temp_df.columns[0:]))).alias(col[:-1]+"max")
    rowSum = (reduce(add,(column(x) for x in temp_df.columns[0:]))).alias(col[:-1]+"Sum")
    rowSum_0_12 = (reduce(add,(column(x) for x in temp_df.columns[0:2]))).alias(col[:-1]+"Sum_0_12")
    rowSum_12_24 = (reduce(add,(column(x) for x in temp_df.columns[2:4]))).alias(col[:-1]+"Sum_12_24")
    # Calculating Row wise mean
    x1 = temp_df.select(rowMean).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
    print("x1 completed")
#             data_df = data.withColumn
    # Calculating Row wise Max value
    x2 = temp_df.select(rowMax).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
    print("x2 completed")
    df_df = x1.join(x2,x1.colIndex==x2.colIndex,'inner').drop(x2.colIndex)
#             print(data_df.show(1))
    # Calculating Row wise Sum value
    x3 = temp_df.select(rowSum).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
    print("x3 completed")
    df_df = df_df.join(x3,df_df.colIndex==x3.colIndex,'inner').drop(x3.colIndex)
#             print(data_df.show(1))
    # Calculating Row wise sum value between 0 and 1 column
    x4 = temp_df.select(rowSum_0_12).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
    print("x4 completed")
    df_df = df_df.join(x4,df_df.colIndex==x4.colIndex,'inner').drop(x4.colIndex)
#             print(data_df.show(1))
    # Calculating Row wise sum value between 2 and 3 column
    x5 = temp_df.select(rowSum_12_24).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
    print("x5 completed")
    df_df = df_df.join(x5,df_df.colIndex==x5.colIndex,'inner').drop(x5.colIndex)
#             print(data_df.show(1))
#             x4 = x4.withColumn('colIndex',row_number().row_number().over(Window.orderBy(monotonically_increasing_id())))
#             x5 = x5.withColumn('colIndex',row_number().row_number().over(Window.orderBy(monotonically_increasing_id())))
    # Calculating Row wise sum value between x4-x5/x5
    x6_temp = x4.join(x5,'colIndex').withColumn(col[:-1]+'sub_0_12',x4[col[:-1]+'Sum_0_12']-x5[col[:-1]+'Sum_12_24']).select(['colIndex',col[:-1]+'sub_0_12'])
    x6 = x6_temp.join(x5,'colIndex').withColumn(col[:-1]+'Avg_0_12',x6_temp[col[:-1]+'sub_0_12']/x5[col[:-1]+'Sum_12_24']).select(['colIndex',col[:-1]+'Avg_0_12'])
    print("x6 completed")
    df_df = df_df.join(x6,df_df.colIndex==x6.colIndex,'inner').drop(x6.colIndex)
#             print(data_df.show(1))

    rowSum_0_6 = (reduce(add,(column(x) for x in [temp_df.columns[0],temp_df.columns[2]]))).alias(col[:-1]+"Sum_0_6")
    rowSum_6_12 = (reduce(add,(column(x) for x in [temp_df.columns[1],temp_df.columns[3]]))).alias(col[:-1]+"Sum_6_12")
    # Calculating Row wise sum value 0 and 2
    x7 = temp_df.select(rowSum_0_6).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
    print("x7 completed")
    df_df = df_df.join(x7,df_df.colIndex==x7.colIndex,'inner').drop(x7.colIndex)
#             print(data_df.show(1))
    # Calculating Row wise sum value 1 and 3
    x8 = temp_df.select(rowSum_6_12).withColumn('colIndex',row_number().over(Window.orderBy(monotonically_increasing_id())))
    print("x8 completed")
    df_df = df_df.join(x8,df_df.colIndex==x8.colIndex,'inner').drop(x8.colIndex)
    if i == 0:
        data_final = df_df
    else:
        data_final = data_final.join(df_df,data_final.colIndex==df_df.colIndex,'inner').drop(df_df.colIndex)
    i+= 1

data_final.show(2)

Pyspark-如何优化下面的代码，df_final.show（）占用大量内存

0 个答案: