我正在尝试使用 multiprocessing 来加快 pandas 的 `apply()`。
import multiprocessing
import pandas as pd
import numpy as np
from multiprocessing import Pool
# Number of chunks the DataFrame is split into before being sent to the pool.
num_partitions = 5
# Size of the worker pool: one process per CPU reported by the system.
num_cores = multiprocessing.cpu_count()
def parallelize_dataframe(df, func):
    """Apply ``func`` to chunks of ``df`` in parallel and re-assemble the result.

    ``df`` is split into ``num_partitions`` chunks, each chunk is passed to
    ``func`` in a pool of ``num_cores`` worker processes, and the returned
    pieces are concatenated in their original order.

    Args:
        df: The DataFrame to process.
        func: Callable taking one chunk and returning a DataFrame. It is
            handed to worker processes by ``Pool.map``, so it has to be a
            function the workers can look up by name (defined at module
            level of an importable module).

    Returns:
        The concatenation of the per-chunk results.
    """
    # Keep the chunks as a list instead of unpacking into a fixed number of
    # names, so changing num_partitions cannot break the split.
    chunks = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Always shut the workers down, even when a chunk raised.
        pool.close()
        pool.join()
    return pd.concat(results)
def square(x):
    """Return ``x`` raised to the power of two."""
    return x ** 2
def test_func(data):
    """Add a ``square`` column holding the square of each ``col`` value.

    Args:
        data: A DataFrame chunk that contains a ``col`` column.

    Returns:
        The same ``data`` object, with the new ``square`` column assigned
        in place.
    """
    # Show which chunk this call received.
    print("Process working on: ", data)
    data["square"] = data["col"].apply(square)
    return data
# Sample input: a single column "col" holding the integers 0 through 9.
df = pd.DataFrame({'col': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})

# Only start the pool when this file is executed as a script. The guard keeps
# the pool from being created again if the module is merely imported.
if __name__ == '__main__':
    test = parallelize_dataframe(df, test_func)
    print(test)
运行这段代码时出现以下错误:`AttributeError: Can't get attribute 'test_func' on <module '__main__' (built-in)>`。
我知道这可能是某种循环导入造成的,但我不确定应该修改代码中的哪一部分。为什么会出现此错误?