使用多处理(multiprocessing)的 Pool.map() 时,无法 pickle <function <lambda>>

时间:2018-03-10 14:08:04

标签: parallel-processing multiprocessing pickle python-3.6 python-multiprocessing

我正在尝试使用 multiprocessing 库并行化我的 Python 脚本。我的函数是类的一部分,我使用 Pool.map() 来调用它:

import numpy as np
import pandas as pd
import netCDF4
import itertools
import multiprocessing as mpp
from tqdm import tqdm

class catch2grid(object):
    """Map catchment runoff observations onto a model grid and compute
    flow-percentile statistics, sequentially (``main``) or in
    parallel (``main_par``)."""

    def __init__(self):
        """Init of catch2grid."""
        # Progress-bar handle; presumably created later by init_pbar()
        # before main() calls self.pbar.update() — TODO confirm.
        self.pbar = None

...

def main(self, db_Qobs_meta_dir, ens_mean_dir, ens_sd_dir, db_Qobs_dir,
         range_start, range_end):
    """Sequential computing of several flow percentiles for Qobs and Qsim,
    the standard deviation of the flow percentiles of Qsim and the
    KGE alpha.

    db_Qobs_meta_dir -- Path to file with meta informations on the
                        Catchments
    ens_mean_dir -- Path to file with runoff ensemble mean
    ens_sd_dir -- Path to file with runoff ensemble standard deviation
    db_Qobs_dir -- Path to folder with observed runoff database_Qobs_new
    range_start -- starting value of range
    range_end -- stopping value of range

    Returns a 4-tuple of DataFrames indexed by catchment name:
    (Qobs percentiles, Qsim percentiles, sd(Qsim) percentiles, KGE alpha).
    """
    range_catch = range(range_start, range_end)
    df_meta = self.import_meta(db_Qobs_meta_dir)
    df_meta = self.select_catchments(df_meta)
    Ens_mean, Ens_mean_Q = self.import_ens_mean(ens_mean_dir)
    Ens_sd, Ens_sd_Q = self.import_ens_sd(ens_sd_dir)
    Grid_lats_cen, Grid_lons_cen = self.grid_cen_arr(Ens_mean)

    # Percentile levels computed for every catchment.
    percs = (5, 25, 50, 75, 95)

    df_Qobs_percs = pd.DataFrame(
        index=range_catch,
        columns=['Catch_name', 't_scale_Qobs', 'Time_cov',
                 'Q_5', 'Q_25', 'Q_50', 'Q_75', 'Q_95'])
    df_Qsim_percs = pd.DataFrame(
        index=range_catch,
        columns=['Catch_name', 'Q_5', 'Q_25', 'Q_50', 'Q_75', 'Q_95'])
    df_sdQsim_percs = pd.DataFrame(
        index=range_catch,
        columns=['Catch_name', 'sdQsim_5', 'sdQsim_25', 'sdQsim_50',
                 'sdQsim_75', 'sdQsim_95'])
    df_KGE_alpha = pd.DataFrame(index=range_catch, columns=['KGE_alpha'])

    # Catchment names for the processed slice (hoisted: identical for
    # all four result frames).
    catch_names = df_meta['Catchments'][range_catch[0]:range_catch[-1] + 1]
    df_Qobs_percs['Catch_name'] = catch_names
    df_Qsim_percs['Catch_name'] = catch_names
    df_sdQsim_percs['Catch_name'] = catch_names
    df_KGE_alpha['Catch_name'] = catch_names

    for k in range_catch:
        sum_Lat_bool, sum_Lon_bool, Lat_idx, Lon_idx = self.matchgrid(
            df_meta, db_Qobs_dir, Grid_lats_cen, Grid_lons_cen, k)
        df_Q, t_scale_Qobs = self.Qsim_to_catch(
            df_meta, db_Qobs_dir, Ens_mean, Ens_mean_Q,
            sum_Lat_bool, sum_Lon_bool, Lat_idx, Lon_idx, k)
        df_sdQsim = self.sdQsim_to_catch(
            df_meta, db_Qobs_dir, Ens_sd, Ens_sd_Q,
            sum_Lat_bool, sum_Lon_bool, Lat_idx, Lon_idx, k)

        # Use .loc instead of chained indexing (df[col][k] = ...):
        # chained assignment may write to a temporary copy and raises
        # SettingWithCopyWarning in current pandas.
        df_Qobs_percs.loc[k, 't_scale_Qobs'] = t_scale_Qobs
        no_NAs = df_Q['Qobs'].isnull().sum().sum()
        df_Qobs_percs.loc[k, 'Time_cov'] = 1 - (no_NAs / len(df_Q.index))

        for p in percs:
            df_Qobs_percs.loc[k, 'Q_%d' % p] = self.flow_perc(
                df_Q['Qobs'], perc=p)
            df_Qsim_percs.loc[k, 'Q_%d' % p] = self.flow_perc(
                df_Q['Qsim'], perc=p)
            df_sdQsim_percs.loc[k, 'sdQsim_%d' % p] = self.flow_perc_sd(
                df_Q['Qsim'], df_sdQsim['sdQsim'], perc=p)

        df_KGE_alpha.loc[k, 'KGE_alpha'] = self.KGE_alpha(df_Q['Qsim'],
                                                          df_Q['Qobs'])

        # display progress
        self.pbar.update(1)

    # Re-index every frame by catchment name, then keep only the
    # statistic columns.
    df_Qobs_percs.index = df_Qobs_percs['Catch_name']
    df_Qsim_percs.index = df_Qsim_percs['Catch_name']
    df_sdQsim_percs.index = df_sdQsim_percs['Catch_name']
    df_KGE_alpha.index = df_KGE_alpha['Catch_name']
    df_Qobs_percs = df_Qobs_percs.loc[:, 'Q_5':'Q_95']
    df_Qsim_percs = df_Qsim_percs.loc[:, 'Q_5':'Q_95']
    df_sdQsim_percs = df_sdQsim_percs.loc[:, 'sdQsim_5':'sdQsim_95']
    df_KGE_alpha = df_KGE_alpha.loc[:, 'KGE_alpha']

    return df_Qobs_percs, df_Qsim_percs, df_sdQsim_percs, df_KGE_alpha

def main_par(self, db_Qobs_meta_dir, ens_mean_dir, ens_sd_dir, db_Qobs_dir):
    """Parallel computing of several flow percentiles for Qobs and Qsim,
    the standard deviation of the flow percentiles of Qsim and the
    KGE alpha.

    db_Qobs_meta_dir -- Path to file with meta informations on the
                        Catchments
    ens_mean_dir -- Path to file with runoff ensemble mean
    ens_sd_dir -- Path to file with runoff ensemble standard deviation
    db_Qobs_dir -- Path to folder with observed runoff database_Qobs_new

    Returns the same 4-tuple of DataFrames as ``main``, concatenated
    over all worker chunks.
    """
    cpu_cores = mpp.cpu_count() - 1
    df_meta = self.import_meta(db_Qobs_meta_dir)
    df_meta = self.select_catchments(df_meta)

    # Split the catchment index range into cpu_cores contiguous
    # [start, end) slices. (The old version built repeat-lists of
    # length len(lin_dist) == cpu_cores + 1 and relied on zip
    # truncation; it also crashed on empty chunks via temp[0].)
    lin_dist = np.linspace(0, len(df_meta.index), cpu_cores + 1)
    bounds = [int(b) for b in lin_dist]
    subsets = [(db_Qobs_meta_dir, ens_mean_dir, ens_sd_dir, db_Qobs_dir,
                start, end)
               for start, end in zip(bounds[:-1], bounds[1:])]

    # NOTE(review): starmap pickles the bound method self.main, which in
    # turn pickles every attribute of self. A lambda (or other
    # unpicklable object, e.g. an interactive tqdm bar) stored on the
    # instance reproduces the reported "Can't pickle <function <lambda>>"
    # PicklingError — confirm all attributes are picklable.
    p = mpp.Pool(cpu_cores)  # launch pool of workers
    res = p.starmap(self.main, subsets)
    p.close()
    p.join()

    # res is a list of 4-tuples of DataFrames; transpose it into four
    # lists (one per result kind) and merge each across chunks.
    res_obs, res_sim, res_simsd, res_kgealpha = map(list, zip(*res))
    df_Qobs_percs = pd.concat(res_obs, ignore_index=True)
    df_Qsim_percs = pd.concat(res_sim, ignore_index=True)
    df_sdQsim_percs = pd.concat(res_simsd, ignore_index=True)
    df_KGE_alpha = pd.concat(res_kgealpha, ignore_index=True)

    return df_Qobs_percs, df_Qsim_percs, df_sdQsim_percs, df_KGE_alpha
...

if __name__ == "__main__":
    # Paths to the input data — placeholders, adjust to the local setup.
    db_Qobs_meta_dir = 'path/to/Qobs_meta_file'
    ens_mean_dir = 'path/to/ens_mean_file'
    ens_sd_dir = 'path/to/ens_sd_file'
    db_Qobs_dir = 'path/to/database_Qobs_new'

    c2g = catch2grid()
    # NOTE(review): init_pbar() apparently needs the total number of
    # catchments — confirm and call it before main_par(), otherwise
    # self.pbar.update(1) in main() fails on None.

    # main_par() launches its own worker Pool internally. The previous
    # code wrapped it in ANOTHER Pool.map: that fed it integers instead
    # of its four path arguments, referenced the undefined names `mp`
    # and `l` (the module is imported as `mpp`), and would have tried to
    # fork children from daemonic workers. Call it directly instead.
    res = c2g.main_par(db_Qobs_meta_dir, ens_mean_dir, ens_sd_dir,
                       db_Qobs_dir)

运行后会显示以下错误消息:

File "<ipython-input-1-3828921ab3bd>", line 1, in <module>
            runfile('/Users/robinschwemmle/Desktop/MSc_Thesis/Python/catch2grid.py', wdir='/Users/robinschwemmle/Desktop/MSc_Thesis/Python')

File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 705, in runfile
execfile(filename, namespace)

File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)

File "/Users/robinschwemmle/Desktop/MSc_Thesis/Python/catch2grid.py", line 1285, in <module>
c2g.main_par(db_Qobs_meta_dir, Ens_mean_dir, Ens_sd_dir, db_Qobs_dir)

File "/Users/robinschwemmle/Desktop/MSc_Thesis/Python/catch2grid.py", line 798, in main_par
res = p.starmap(self.main, subsets)

File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/pool.py", line 274, in starmap
return self._map_async(func, iterable, starmapstar, chunksize).get()

File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/pool.py", line 644, in get
raise self._value

File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/pool.py", line 424, in _handle_tasks
put(task)

File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))

File "/Users/robinschwemmle/anaconda/envs/py36/lib/python3.6/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)

PicklingError: Can't pickle <function <lambda> at 0x1164e42f0>: attribute lookup <lambda> on jupyter_client.session failed

这个错误是几天前才出现的,在那之前代码一直正常工作。multiprocessing 或 pickle 库最近有什么变化吗?或者有人能给我推荐其他可用的并行库?

0 个答案:

没有答案