Python multiprocessing 进程池(Pool)的 starmap 比顺序执行还慢(非平凡的独立函数)

时间:2017-06-13 04:15:31

标签: python performance numpy multiprocessing pool

我在一台8核i7机器上使用Python 3.6,以朴素的方法在网格上求解微分方程。下面的函数使用numpy进行一系列向量化计算(求和、乘法和点积),并通过for循环在每个时间步以累加的方式更新数值。

def dynamics(cParams, gParams, sParams):
    """Take parameters, run the time evolution, and return the observables.

    The state vector ``Bkt`` and the scalar ``phit`` both start at zero and
    are advanced with an explicit step ``x <- x + dt * dx`` for every entry
    of the time grid.  All (theta, k) grid quantities are flattened to 1-D
    vectors before the loop.

    Parameters
    ----------
    cParams : sequence
        ``[P, aIBi]``.
    gParams : sequence
        ``[kcutoff, dk, Ntheta, dtheta, tMax, dt]``.  ``Ntheta`` is unpacked
        but not used; the theta grid is built from ``dtheta`` alone.
    sParams : sequence
        ``[mI, mB, n0, gBB]``.

    Returns
    -------
    list
        ``[paramData, tfData, obData]`` with
        ``paramData = [cParams, gParams, sParams]``,
        ``tfData = [tVec, freqVec]`` and
        ``obData = [PB_Vec, NB_Vec, S_Vec, A_Vec]``.

    Notes
    -----
    Relies on ``omega0_k``, ``Wk``, ``g``, ``PB``, ``pchi``, ``mchi``,
    ``dynOverlap`` and ``spectFunc``, which are defined outside this block.
    """
    P, aIBi = cParams
    kcutoff, dk, _Ntheta, dtheta, tMax, dt = gParams
    mI, mB, n0, gBB = sParams

    kVec = np.arange(dk, kcutoff, dk)
    thetaVec = np.arange(dtheta, np.pi, dtheta)
    tVec = np.arange(0, tMax, dt)
    gridSize = thetaVec.size * kVec.size

    # precomputing things that only depend on k, theta and not t
    Omega0K = omega0_k(kVec, gBB, mI, mB, n0)
    Wkv = Wk(kVec, gBB, mB, n0)
    gnum = g(aIBi, kcutoff, gBB, mI, mB, n0)
    thetaones = np.ones(thetaVec.size)

    # Build each quantity on the (theta, k) grid, then flatten it row by row.
    Omega0K_Vec = np.outer(thetaones, Omega0K).reshape(gridSize)
    Wk_Vec = np.outer(thetaones, Wkv).reshape(gridSize)
    Wki_Vec = 1 / Wk_Vec
    dV_Vec = (
        (2 * np.pi / (2 * np.pi)**3)
        * np.outer(dtheta * np.sin(thetaVec), dk * kVec**2)
    ).reshape(gridSize)
    kcos_Vec = np.outer(np.cos(thetaVec), kVec).reshape(gridSize)

    # Loop-invariant factors, hoisted so they are not recomputed every step.
    g_sqrt_n0 = gnum * np.sqrt(n0)
    g_n0 = gnum * n0

    # initial conditions
    Bkt = np.zeros(gridSize, dtype=complex)
    phit = 0 + 0j

    # storage for the quantities tracked at every time step
    PB_Vec = np.zeros(tVec.size, dtype=float)
    phi_Vec = np.zeros(tVec.size, dtype=complex)
    NB_Vec = np.zeros(tVec.size, dtype=float)

    for ind in range(tVec.size):
        # keep track of quantities we care about (storing data)
        PB_Vec[ind] = PB(Bkt, kcos_Vec, dV_Vec, gBB, mB, n0)
        phi_Vec[ind] = phit

        # Bkt * conj(Bkt) has a zero imaginary part; take .real explicitly
        # instead of relying on an implicit complex -> float cast on store.
        NB_Vec[ind] = np.dot(Bkt * np.conjugate(Bkt), dV_Vec).real

        # quantities reused by both update equations below
        xpt = pchi(Bkt, Wk_Vec, dV_Vec, gBB, mB, n0)
        xmt = mchi(Bkt, Wki_Vec, dV_Vec, gBB, mB, n0)

        # update Bkt and phit to their values at the next time step
        BDiff = -1j * (
            g_sqrt_n0 * Wk_Vec
            + Bkt * (Omega0K_Vec - kcos_Vec * (P - PB_Vec[ind]) / mI)
            + gnum * (Wk_Vec * xpt + Wki_Vec * xmt)
        )
        phiDiff = g_n0 + g_sqrt_n0 * xpt + (P**2 - PB_Vec[ind]**2) / (2 * mI)
        Bkt = Bkt + dt * BDiff
        phit = phit + dt * phiDiff

    S_Vec = dynOverlap(NB_Vec, phi_Vec)
    freqVec, A_Vec = spectFunc(S_Vec, tVec)

    # bundle the data to return
    tfData = [tVec, freqVec]
    paramData = [cParams, gParams, sParams]
    obData = [PB_Vec, NB_Vec, S_Vec, A_Vec]
    data = [paramData, tfData, obData]

    # Optional on-disk save, left disabled as in the original:
    # dirpath = os.path.dirname(os.path.realpath(__file__))
    # np.save(dirpath + '/pdata/fmquench_aIBi:%.2f_P:%.2f.npy' % (aIBi, P), data)
    return data

当我改变输入参数、分别用pool.starmap和串行方式(for循环或itertools.starmap)多次运行该函数时,串行计算反而明显更快。在下面的示例中,我调用该函数7次;对于所定义的网格参数(未显示),串行时每次运行大约需要85秒,总计约512秒。而当我使用包含全部8个处理器的进程池时,运行时间约为705秒。据我所知,不同的函数调用之间没有通信,而且每次调用的计算量都不小,所以我不明白为什么多进程反而更慢。我做错了什么?代码中的哪些瓶颈会导致较高的IPC开销?是什么让多进程变慢了?

# The guard keeps worker processes from re-running this script body when
# multiprocessing re-imports the main module (the "spawn" start method).
if __name__ == "__main__":
    # Argument tuples for dynamics(), one per entry of cParams_List.
    # Stored as a list (not a one-shot zip iterator) so the same arguments
    # can be reused by the serial comparisons below.
    paramsIter = list(zip(cParams_List, it.repeat(gParams), it.repeat(sParams)))

    # NOTE(review): if NumPy is linked against a multithreaded BLAS, every
    # worker may start its own thread pool and oversubscribe the cores.
    # Consider limiting BLAS threads (e.g. OMP_NUM_THREADS=1, set before
    # numpy is imported) and profiling again -- unconfirmed for this setup.

    # compute data (parallel)
    start = timer()
    with mp.Pool() as pool:
        # starmap blocks until every call has finished; keep the results
        # instead of discarding them.
        results = pool.starmap(dynamics, paramsIter)
    end = timer()
    print(end - start)

    # compute data (serial) - for loop
    # start = timer()
    # for z in paramsIter:
    #     dynamics(*z)
    # end = timer()
    # print(end - start)

    # compute data (serial) - starmap
    # start = timer()
    # for i in it.starmap(dynamics, paramsIter):
    #     i
    # end = timer()
    # print(end - start)

0 个答案:

没有答案