我正在尝试使用 multiprocessing(多进程)将一个函数的结果(多个 pandas DataFrame)导出到同一个 Excel 文件的多个工作表中。为此,我编写了以下代码。
test_multiprocessing_3.py
def aggregator(d):
    """Aggregate one pair of columns and write the result to an Excel sheet.

    ``d`` is an indexable work item laid out as ``(df, writer, col1, col2)``:
    the DataFrame to aggregate, the object handed to ``DataFrame.to_excel``
    as its target, and the names of the two columns to group by.

    For every ``(col1, col2)`` group the value ``100 * sum(enr) / row count``
    is computed and stored in a column named ``ontime_en_rate``.  The result
    is written, without the index, to a sheet called
    ``<col1>_<col2>_ontime_en``.  Nothing is returned.
    """
    df = d[0]
    writer = d[1]
    col1 = d[2]
    col2 = d[3]

    agg_df = (
        df.groupby([col1, col2])
        .apply(lambda x: 100 * (x['enr'].sum() / len(x)))
        .reset_index(name='ontime_en_rate')
    )

    sheet_name = col1 + "_" + col2 + "_ontime_en"
    # Pass sheet_name by keyword: newer pandas releases deprecate positional
    # arguments to to_excel other than the writer itself.
    agg_df.to_excel(writer, sheet_name=sheet_name, index=False)
test_multiprocessing_4.py
from multiprocessing import Pool
from pandas import ExcelWriter
import pandas as pd
import test_multiprocessing_3
if __name__ == '__main__':
    # Workbook writer that is shipped along with every work item below.
    writer = ExcelWriter('D:/Exports/user_funct_agg_test9.xlsx')
    df_main = pd.read_excel('D:/Exports/test_data.xlsx')

    # One work item per (member, <second column>) pair.
    dimensions = [
        (df_main, writer, 'member', second_col)
        for second_col in ('type', 'month')
    ]

    # NOTE(review): the ExcelWriter instance is part of each work item that
    # pool.map sends to the worker processes; presumably this is where the
    # reported TypeError originates -- confirm.
    with Pool(processes=3) as pool:
        pool.map(test_multiprocessing_3.aggregator, dimensions)

    writer.save()
但是出现以下错误:
TypeError: __new__() missing 1 required positional argument: 'path'
另外,我是在 Windows 10 计算机上运行 Python v3.7.5。我也尝试过把该函数和调用它的代码放在同一个文件中,但出现了相同的错误。
test_multiprocessing_5.py
import pandas as pd
#import numpy as np
from multiprocessing import Pool
from pandas import ExcelWriter
def aggregator(data_chunks):
    """Aggregate one pair of columns and write the result to an Excel sheet.

    ``data_chunks`` is an indexable work item laid out as
    ``(df, target, col1, col2)``: the DataFrame to aggregate, the object
    handed to ``DataFrame.to_excel`` as its target, and the names of the two
    columns to group by.

    For every ``(col1, col2)`` group the value ``100 * sum(enr) / row count``
    is computed and stored in a column named ``ontime_en_rate``.  The result
    is written, without the index, to a sheet called
    ``<col1>_<col2>_ontime_en``.  Nothing is returned.
    """
    df = data_chunks[0]
    # The caller in this file puts an ExcelWriter here, not a file name.
    excel_target = data_chunks[1]
    col1 = data_chunks[2]
    col2 = data_chunks[3]

    agg_df = (
        df.groupby([col1, col2])
        .apply(lambda x: 100 * (x['enr'].sum() / len(x)))
        .reset_index(name='ontime_en_rate')
    )

    sheet_name = col1 + "_" + col2 + "_ontime_en"
    # Pass sheet_name by keyword: newer pandas releases deprecate positional
    # arguments to to_excel other than the writer itself.
    agg_df.to_excel(excel_target, sheet_name=sheet_name, index=False)
if __name__ == '__main__':
    # Workbook writer that is shipped along with every work item below.
    writer = ExcelWriter('D:/Exports/user_funct_agg_test9.xlsx')
    df_main = pd.read_excel('D:/Exports/test_data.xlsx')

    # One work item per (member, <second column>) pair.
    dimensions = [
        (df_main, writer, 'member', second_col)
        for second_col in ('type', 'month')
    ]

    # NOTE(review): the ExcelWriter instance is part of each work item that
    # pool.map sends to the worker processes; presumably this is where the
    # reported TypeError originates -- confirm.
    with Pool(processes=3) as pool:
        pool.map(aggregator, dimensions)

    writer.save()
能否帮我看看是哪一步出了问题?
编辑:感谢 @stovfl 指出 https://stackoverflow.com/a/43101031/7414759
中的回答,以下是据此更新后可以正常运行的代码:
import pandas as pd
import multiprocessing as mp
from pandas import ExcelWriter
def aggregator(df, col1, col2):
    """Return ``100 * sum(enr) / row count`` for every ``(col1, col2)`` group.

    Args:
        df: DataFrame containing an ``enr`` column plus the two grouping
            columns.
        col1: Name of the first column to group by.
        col2: Name of the second column to group by.

    Returns:
        A DataFrame with one row per group and the columns ``col1``,
        ``col2`` and ``ontime_en_rate``.
    """
    def _rate(enr):
        # len() counts every row of the group, so missing values still
        # contribute to the denominator.
        return 100 * (enr.sum() / len(enr))

    # Select 'enr' before applying: the function only needs that column, and
    # this avoids running DataFrameGroupBy.apply over the grouping columns.
    rates = df.groupby([col1, col2])['enr'].apply(_rate)
    return rates.reset_index(name='ontime_en_rate')
if __name__ == '__main__':
    df_main = pd.read_excel('D:/Exports/test_data.xlsx')
    dimensions = [(df_main, 'member', 'type'), (df_main, 'member', 'month')]

    # Workers only compute and return DataFrames; all Excel writing happens
    # in this parent process. starmap blocks until every result is back, so
    # the pool can be released as soon as the with-block ends.
    with mp.Pool(processes=3) as pool:
        results = pool.starmap(aggregator, dimensions)

    # The context manager saves and closes the workbook on exit, including
    # when an exception is raised, and does not rely on ExcelWriter.save(),
    # which newer pandas releases no longer provide.
    with ExcelWriter('D:/Exports/user_funct_agg_test9.xlsx') as writer:
        for k, result in enumerate(results):
            sheet_name = 'ontime_' + str(k)
            result.to_excel(writer, sheet_name=sheet_name, index=False)