I am trying to use multiprocessing on a very large pandas DataFrame by splitting it and processing each split in parallel. However, I get a pickle error, and I need help running each split of the DataFrame concurrently in a separate process.
import glob
import os
from multiprocessing import Pool, process

import numpy as np
import pandas as pd

if __name__ == '__main__':
    file_or_path = r"H:\Template_for_BulkUpload\UK"
    col_mapping = "col1:sfx_afc,col2:sgr_afc"
    # split the "source:destination" pairs into two parallel name lists
    slist1, dlist2 = zip(*map(lambda s: s.split(':'), col_mapping.split(',')))
    slist = list(slist1)
    dlist = list(dlist2)
    all_files = glob.glob(os.path.join(file_or_path, "*.*"))
    # usecols expects a list of columns (example: [col1, col2])
    df_frames = (pd.read_excel(f, sheet_name=0, header=0, keep_default_na=False,
                               usecols=slist1)
                 for f in all_files)
    df = pd.concat(df_frames, ignore_index=True)
    df.columns = dlist2  # rename columns to the destination column names
    p = Pool(processes=4)
    pool_results = p.map(process, np.array_split(df, 4))
Error:
> Traceback (most recent call last):
>   File "E:/PythonScripts/PyCharm/PythonScripts_withPyCharm/DataIngestionScripts/Threading_test.py", line 32, in <module>
>     pool_results=p.map(process, np.array_split(df, 4))
>   File "C:\python_customize_install_location\lib\multiprocessing\pool.py", line 364, in map
>     return self._map_async(func, iterable, mapstar, chunksize).get()
>   File "C:\python_customize_install_location\lib\multiprocessing\pool.py", line 768, in get
>     raise self._value
>   File "C:\python_customize_install_location\lib\multiprocessing\pool.py", line 537, in _handle_tasks
>     put(task)
>   File "C:\python_customize_install_location\lib\multiprocessing\connection.py", line 206, in send
>     self._send_bytes(_ForkingPickler.dumps(obj))
>   File "C:\python_customize_install_location\lib\multiprocessing\reduction.py", line 51, in dumps
>     cls(buf, protocol).dump(obj)
> TypeError: cannot pickle 'module' object
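
For context, the traceback points at p.map(process, ...): with from multiprocessing import Pool, process, the name process is the multiprocessing.process submodule, not a worker function, so the pool tries to pickle a module object when it queues the tasks. Below is a minimal sketch of the intended pattern, assuming the per-split work is done by a top-level worker function; process_chunk is a hypothetical placeholder, and the inline DataFrame stands in for the concatenated Excel data.

import numpy as np
import pandas as pd
from multiprocessing import Pool

def process_chunk(chunk):
    # placeholder for the real per-split processing; it must be a
    # module-level function so the pool can pickle a reference to it
    return len(chunk)

if __name__ == '__main__':
    # stand-in for the DataFrame built from the Excel files above
    df = pd.DataFrame({'sfx_afc': range(100), 'sgr_afc': range(100)})
    with Pool(processes=4) as p:
        # each split is pickled and sent to a worker process
        pool_results = p.map(process_chunk, np.array_split(df, 4))
    print(pool_results)

Defining the worker at module level matters here: Pool.map pickles both the function reference and each DataFrame split to ship them to the worker processes, which is why passing a module (or a lambda, which pickle also cannot serialize) fails at put(task).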