我有一段很长的代码,有人能提供一些精简它的思路吗?
我尝试用 for 循环来实现:以分割数作为循环次数并调用函数,但不确定如何遍历每一个分割。
def triggerexec(df_count,event_df):
    """Split ``event_df`` into equally weighted random parts and push each one.

    The number of parts depends on ``df_count``: 2 parts when it is at most
    10000, 4 parts when it lies between 10001 and 50000 (inclusive).  Any
    other value matches neither branch, so nothing is done.

    For every part, a message is printed when the part is empty; otherwise
    the part is handed to ``extract_and_push``.

    Args:
        df_count: Row count used only to choose the number of parts.
        event_df: Object providing ``randomSplit`` (presumably a Spark
            DataFrame -- confirm against the caller).
    """
    if df_count <= 10000:
        print("Input partioned into 2 splits.")
        # Equal weights: each part gets roughly the same share of rows.
        df_splits = event_df.randomSplit(([1.0,1.0]))
        # The same check/log/push sequence is repeated by hand per part.
        if df_splits[0].rdd.isEmpty():
            print("No data in 1st split")
        else:
            print ("Input count for 1st split: " + str(df_splits[0].count()))
            extract_and_push(df_splits[0])
        if df_splits[1].rdd.isEmpty():
            print("No data in 2nd split")
        else:
            print ("Input count for 2nd split: " + str(df_splits[1].count()))
            extract_and_push(df_splits[1])
    elif 10001 <= df_count <= 50000:
        print ("Input partioned into 4 splits.")
        df_splits = event_df.randomSplit(([1.0, 1.0, 1.0, 1.0]))
        # NOTE(review): unlike the 2-split branch, the per-part count() is
        # commented out below, so only a fixed message is printed.
        if df_splits[0].rdd.isEmpty():
            print("No data in 1st split")
        else:
            print ("Processing 1st split")# + str(df_splits[0].count()))
            extract_and_push(df_splits[0])
        if df_splits[1].rdd.isEmpty():
            print("No data in 2nd split")
        else:
            print ("Processing 2nd split ")# + str(df_splits[1].count()))
            extract_and_push(df_splits[1])
        if df_splits[2].rdd.isEmpty():
            print("No data in 3rd split")
        else:
            print ("Processing 3rd split")# + str(df_splits[2].count()))
            extract_and_push(df_splits[2])
        if df_splits[3].rdd.isEmpty():
            print("No data in 4th split")
        else:
            print ("Processing 4th split") # + str(df_splits[3].count()))
            extract_and_push(df_splits[3])
答案 0 :(得分:2)
最简单的解决方案是把重复的任务提取到另一个函数中,然后调用它。我把这个函数写成可以处理 N 个分割,
因此不需要在各个 if 分支内分别调用它。
def splits(df_splits):
    """Log and push every non-empty split in ``df_splits``.

    For each split (numbered from 1) either a "no data" message is printed,
    or its row count is printed and the split is passed to
    ``extract_and_push``.

    Args:
        df_splits: Iterable of split objects, each providing ``count()``
            (e.g. the list returned by ``DataFrame.randomSplit``).
    """
    for number, split in enumerate(df_splits, start=1):
        # A single count() answers both "is it empty?" and "how many rows?",
        # so each split is evaluated once instead of rdd.isEmpty() + count().
        row_count = split.count()
        if row_count == 0:
            print("No data in split number {}".format(number))
        else:
            print("Input count for split number {}: {}".format(number, row_count))
            extract_and_push(split)
def triggerexec(df_count,event_df):
if df_count <= 10000:
print("Input partioned into 2 splits.")
df_splits = event_df.randomSplit(([1.0,1.0]))
elif df_count <= 50000:
print ("Input partioned into 4 splits.")
df_splits = event_df.randomSplit(([1.0, 1.0, 1.0, 1.0]))
else:
# Either do something in every case posible, return, or throw an error, but make sure that the splits(df_splits) is not called id df_splits is not defined
return
splits(df_splits)
一个更进一步的解决方案是:让函数额外接受分割数作为参数,并把拆分操作也放到这个外层(outer)函数中完成:
def split_this(event_df, n):
    """Randomly split ``event_df`` into ``n`` equally weighted parts and push them.

    Prints how many parts are used, then for each part (numbered from 1)
    either prints a "no data" message or prints its row count and passes
    it to ``extract_and_push``.

    Args:
        event_df: Object providing ``randomSplit`` that returns parts
            supporting ``count()``.
        n: Number of parts; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1 (the weight list would be
            empty, so no part could be produced).
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {!r}".format(n))

    print("Input partitioned into {} splits.".format(n))
    df_splits = event_df.randomSplit([1.0] * n)
    for number, split in enumerate(df_splits, start=1):
        # A single count() serves both the emptiness check and the log line,
        # replacing the former rdd.isEmpty() + count() pair.
        row_count = split.count()
        if row_count == 0:
            print("No data in split number {}".format(number))
        else:
            print("Input count for split number {}: {}".format(number, row_count))
            extract_and_push(split)
def triggerexec(df_count, event_df):
    """Pick the number of parts from ``df_count`` and delegate to ``split_this``.

    Up to 10000 rows -> 2 parts, up to 50000 rows -> 4 parts, anything
    else -> 10 parts.

    Args:
        df_count: Row count used to pick the number of parts.
        event_df: Input forwarded unchanged to ``split_this``.
    """
    # (inclusive upper bound, number of parts), checked in ascending order.
    size_classes = ((10000, 2), (50000, 4))

    n_parts = 10  # default when no size class matches
    for upper_bound, parts in size_classes:
        if df_count <= upper_bound:
            n_parts = parts
            break

    split_this(event_df, n_parts)
其中 enumerate 的用法和默认情况的处理要归功于 @Lex。
答案 1 :(得分:0)
Python 3版本:
def triggerexec(df_count, event_df):
    """Randomly split ``event_df`` by size class and push every non-empty part.

    Up to 10000 rows the input is split in two, up to 50000 rows in four
    (equal weights in both cases).  Larger counts produce no parts, so
    nothing is processed.  Parts are numbered from 1 in the log output.

    Args:
        df_count: Row count used to pick the number of parts.
        event_df: Object providing ``randomSplit`` that returns parts
            supporting ``count()``.
    """
    if df_count <= 10000:
        print("Input partioned into 2 splits.")
        df_splits = event_df.randomSplit([1.0, 1.0])
    elif df_count <= 50000:
        print("Input partioned into 4 splits.")
        df_splits = event_df.randomSplit([1.0, 1.0, 1.0, 1.0])
    else:
        # No rule for this size: an empty list makes the loop a no-op.
        df_splits = []

    for number, split in enumerate(df_splits, start=1):
        # One count() covers both the emptiness check and the log line.
        row_count = split.count()
        if row_count == 0:
            print("No Data in split Nr. {:d}".format(number))
        else:
            # count() is an int, so it needs ``d``; the ``s`` format code
            # raises ValueError for non-string values.
            print("Input count for split Nr. {:d}: {:d}".format(number, row_count))
            extract_and_push(split)
这里把全部逻辑放在同一个函数中,并使用 enumerate 来获取索引;在我看来(IMO),这种写法更加优雅。