我正在尝试将我的python脚本与多处理库并行化。我的函数是类的一部分,我使用Pool.map
。
import numpy as np
import pandas as pd
import netCDF4
import itertools
import multiprocessing as mpp
from tqdm import tqdm
Class catch2grid(object):
def __init__(self):
"""Init of catch2grid."""
self.pbar = None
...
def main(self, db_Qobs_meta_dir, ens_mean_dir, ens_sd_dir, db_Qobs_dir,
range_start, range_end):
"""Sequential computing of several flow percentiles for Qobs and Qsim,
the standard deviation of the flow percentiles of Qsim and the
KGE alpha.
db_Qobs_meta_dir -- Path to file with meta informations on the
Catchments
ens_mean_dir -- Path to file with runoff ensemble mean
ens_sd_dir -- Path to file with runoff ensemble standard deviation
db_Qobs_dir -- Path to folder with observed runoff database_Qobs_new
range_start -- starting value of range
range_end -- stopping value of range
"""
range_catch = range(range_start, range_end)
df_meta = self.import_meta(db_Qobs_meta_dir)
df_meta = self.select_catchments(df_meta)
Ens_mean, Ens_mean_Q = self.import_ens_mean(ens_mean_dir)
Ens_sd, Ens_sd_Q = self.import_ens_sd(ens_sd_dir)
Grid_lats_cen, Grid_lons_cen = self.grid_cen_arr(Ens_mean)
df_Qobs_percs = pd.DataFrame(index=range_catch, columns=
['Catch_name', 't_scale_Qobs', 'Time_cov',
'Q_5', 'Q_25', 'Q_50',
'Q_75', 'Q_95'])
df_Qsim_percs = pd.DataFrame(index=range_catch, columns=
['Catch_name', 'Q_5', 'Q_25', 'Q_50',
'Q_75', 'Q_95'])
df_sdQsim_percs = pd.DataFrame(index=range_catch, columns=
['Catch_name', 'sdQsim_5', 'sdQsim_25',
'sdQsim_50', 'sdQsim_75', 'sdQsim_95'])
df_KGE_alpha = pd.DataFrame(index=range_catch, columns=['KGE_alpha'])
df_Qobs_percs['Catch_name'] = df_meta['Catchments']\
[range_catch[0]:range_catch[-1]+1]
df_Qsim_percs['Catch_name'] = df_meta['Catchments']\
[range_catch[0]:range_catch[-1]+1]
df_sdQsim_percs['Catch_name'] = df_meta['Catchments']\
[range_catch[0]:range_catch[-1]+1]
df_KGE_alpha['Catch_name'] = df_meta['Catchments']\
[range_catch[0]:range_catch[-1]+1]
for k in range_catch:
sum_Lat_bool, sum_Lon_bool, Lat_idx, Lon_idx = self.matchgrid(df_meta,
db_Qobs_dir,
Grid_lats_cen,
Grid_lons_cen,
k)
df_Q, t_scale_Qobs = self.Qsim_to_catch(df_meta, db_Qobs_dir,
Ens_mean, Ens_mean_Q,
sum_Lat_bool, sum_Lon_bool,
Lat_idx, Lon_idx, k)
df_sdQsim = self.sdQsim_to_catch(df_meta, db_Qobs_dir, Ens_sd,
Ens_sd_Q, sum_Lat_bool,
sum_Lon_bool, Lat_idx, Lon_idx, k)
df_Qobs_percs['t_scale_Qobs'][k] = t_scale_Qobs
no_NAs = df_Q['Qobs'].isnull().sum().sum()
df_Qobs_percs['Time_cov'][k] = 1 - (no_NAs/len(df_Q.index))
df_Qobs_percs['Q_95'][k] = self.flow_perc(df_Q['Qobs'], perc=95)
df_Qobs_percs['Q_75'][k] = self.flow_perc(df_Q['Qobs'], perc=75)
df_Qobs_percs['Q_50'][k] = self.flow_perc(df_Q['Qobs'], perc=50)
df_Qobs_percs['Q_25'][k] = self.flow_perc(df_Q['Qobs'], perc=25)
df_Qobs_percs['Q_5'][k] = self.flow_perc(df_Q['Qobs'], perc=5)
df_Qsim_percs['Q_95'][k] = self.flow_perc(df_Q['Qsim'], perc=95)
df_Qsim_percs['Q_75'][k] = self.flow_perc(df_Q['Qsim'], perc=75)
df_Qsim_percs['Q_50'][k] = self.flow_perc(df_Q['Qsim'], perc=50)
df_Qsim_percs['Q_25'][k] = self.flow_perc(df_Q['Qsim'], perc=25)
df_Qsim_percs['Q_5'][k] = self.flow_perc(df_Q['Qsim'], perc=5)
df_sdQsim_percs['sdQsim_95'][k] = self.flow_perc_sd(df_Q['Qsim'], df_sdQsim['sdQsim'], perc=95)
df_sdQsim_percs['sdQsim_75'][k] = self.flow_perc_sd(df_Q['Qsim'], df_sdQsim['sdQsim'], perc=75)
df_sdQsim_percs['sdQsim_50'][k] = self.flow_perc_sd(df_Q['Qsim'], df_sdQsim['sdQsim'], perc=50)
df_sdQsim_percs['sdQsim_25'][k] = self.flow_perc_sd(df_Q['Qsim'], df_sdQsim['sdQsim'], perc=25)
df_sdQsim_percs['sdQsim_5'][k] = self.flow_perc_sd(df_Q['Qsim'], df_sdQsim['sdQsim'], perc=5)
df_KGE_alpha['KGE_alpha'][k] = self.KGE_alpha(df_Q['Qsim'], df_Q['Qobs'])
# display progress
self.pbar.update(1)
df_Qobs_percs.index = df_Qobs_percs['Catch_name']
df_Qsim_percs.index = df_Qsim_percs['Catch_name']
df_sdQsim_percs.index = df_sdQsim_percs['Catch_name']
df_KGE_alpha.index = df_KGE_alpha['Catch_name']
df_Qobs_percs = df_Qobs_percs.loc[:, 'Q_5':'Q_95']
df_Qsim_percs = df_Qsim_percs.loc[:, 'Q_5':'Q_95']
df_sdQsim_percs = df_sdQsim_percs.loc[:, 'sdQsim_5':'sdQsim_95']
df_KGE_alpha = df_KGE_alpha.loc[:, 'KGE_alpha']
return df_Qobs_percs, df_Qsim_percs, df_sdQsim_percs, df_KGE_alpha
def main_par(self, db_Qobs_meta_dir, ens_mean_dir, ens_sd_dir, db_Qobs_dir):
"""Parallel computing of several flow percentiles for Qobs and Qsim,
the standard deviation of the flow percentiles of Qsim and the
KGE alpha.
db_Qobs_meta_dir -- Path to file with meta informations on the
Catchments
ens_mean_dir -- Path to file with runoff ensemble mean
ens_sd_dir -- Path to file with runoff ensemble standard deviation
db_Qobs_dir -- Path to folder with observed runoff database_Qobs_new
"""
cpu_cores = mpp.cpu_count() - 1
df_meta = self.import_meta(db_Qobs_meta_dir)
df_meta = self.select_catchments(df_meta)
# chunking subsets for parallelization
ll_start = []
ll_end = []
lin_dist = np.linspace(0, len(df_meta.index), cpu_cores+1)
l = len(lin_dist)
# list of tuples with input arguments for map
for i in range(len(lin_dist) - 1):
temp = list(range(int(lin_dist[i]), int(lin_dist[i+1]), 1))
ll_start.append(temp[0])
ll_end.append(temp[-1]+1)
ll_db_Qobs_meta_dir = list(itertools.repeat(db_Qobs_meta_dir, l))
ll_Ens_mean_dir = list(itertools.repeat(ens_mean_dir, l))
ll_Ens_sd_dir = list(itertools.repeat(ens_sd_dir, l))
ll_db_Qobs_dir = list(itertools.repeat(db_Qobs_dir, l))
subsets = zip(ll_db_Qobs_meta_dir, ll_Ens_mean_dir, ll_Ens_sd_dir,
ll_db_Qobs_dir, ll_start, ll_end)
p = mpp.Pool(cpu_cores) # launch pool of workers
res = p.starmap(self.main, subsets)
p.close()
p.join()
res_obs = []
res_sim = []
res_simsd = []
res_kgealpha = []
# collect dataframes and merge them
[res_obs.append(res[:][i][0]) for i in range(len(res))]
[res_sim.append(res[:][i][1]) for i in range(len(res))]
[res_simsd.append(res[:][i][2]) for i in range(len(res))]
[res_kgealpha.append(res[:][i][3]) for i in range(len(res))]
df_Qobs_percs = pd.concat(res_obs[:], ignore_index=True)
df_Qsim_percs = pd.concat(res_sim[:], ignore_index=True)
df_sdQsim_percs = pd.concat(res_simsd[:], ignore_index=True)
df_KGE_alpha = pd.concat(res_kgealpha[:], ignore_index=True)
return df_Qobs_percs, df_Qsim_percs, df_sdQsim_percs, df_KGE_alpha
...
if __name__ == "__main__":
cpu_cores = mp.cpu_count() - 1
c2g = catch2grid()
p = mp.Pool(cpu_cores) # launch pool of workers
c2g.init_pbar(l)
ll_range_catch = list(range(0, 5000))
res = p.map(c2g.main_par, ll_range_catch)
p.close()
p.join()
运行后会显示以下错误消息:
File "<ipython-input-1-3828921ab3bd>", line 1, in <module>
runfile('/Users/robinschwemmle/Desktop/MSc_Thesis/Python/catch2grid.py', wdir='/Users/robinschwemmle/Desktop/MSc_Thesis/Python')
File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/Users/robinschwemmle/Desktop/MSc_Thesis/Python/catch2grid.py", line 1285, in <module>
c2g.main_par(db_Qobs_meta_dir, Ens_mean_dir, Ens_sd_dir, db_Qobs_dir)
File "/Users/robinschwemmle/Desktop/MSc_Thesis/Python/catch2grid.py", line 798, in main_par
res = p.starmap(self.main, subsets)
File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/pool.py", line 274, in starmap
return self._map_async(func, iterable, starmapstar, chunksize).get()
File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/pool.py", line 644, in get
raise self._value
File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/pool.py", line 424, in _handle_tasks
put(task)
File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
PicklingError: Can't pickle <function <lambda> at 0x1164e42f0>: attribute lookup <lambda> on jupyter_client.session failed
几天前发生错误。在代码正常工作之前。我不知道多处理或酸洗库有任何变化吗?或者有人向我建议我可以选择哪个并行库?