我试图在python中使用joblib来加速某些数据处理,但是我在尝试解决如何将输出分配到所需格式时遇到问题。我试图生成一个可能过于简单化的代码,它显示了我遇到的问题:
from joblib import Parallel, delayed
import numpy as np
def main():
    """Run the serial and joblib-parallel demos and print each result array.

    Uses print() calls (valid in both Python 2 and Python 3) instead of the
    original Python-2-only print statements.
    """
    print("Nested loop array assignment:")
    regular()
    print("Parallel nested loop assignment using a single process:")
    par2(1)
    print("Parallel nested loop assignment using multiple process:")
    par2(2)
def regular():
    """Populate the global array ``ab`` with a serial nested loop and print it.

    ``ab`` has shape (2, len(a), len(b)); plane 0 holds pairwise sums and
    plane 1 pairwise products (written by func()).
    """
    a = [0, 1, 2, 3, 4]
    b = [0, 1, 2, 3, 4]
    # The result array is global so that func() can write into it directly.
    global ab
    ab = np.zeros((2, np.size(a), np.size(b)))
    # Iterate over every (i, j) pair to populate the array.
    for i in range(np.size(a)):
        for j in range(np.size(b)):
            func(i, j, a, b)
    # print() with a single argument behaves identically on Python 2 and 3.
    print(ab)
def par2(process):
    """Populate the global array ``ab2`` via joblib with ``process`` workers.

    NOTE(review): with n_jobs > 1 each worker process receives a *copy* of
    the module globals, so the writes done by func2 never reach this parent
    array — which is exactly the symptom the question describes.
    """
    a2 = [0, 1, 2, 3, 4]
    b2 = [0, 1, 2, 3, 4]
    global ab2
    ab2 = np.zeros((2, np.size(a2), np.size(b2)))
    # range() replaces Python-2-only xrange(); the generator expression
    # consumes it lazily either way.
    Parallel(n_jobs=process)(
        delayed(func2)(i, j, a2, b2)
        for i in range(np.size(a2))
        for j in range(np.size(b2))
    )
    print(ab2)
def func(i, j, a, b):
    """Store the sum and product of a[i] and b[j] in the global array ``ab``."""
    left, right = a[i], b[j]
    ab[0, i, j] = left + right   # sum plane
    ab[1, i, j] = left * right   # product plane
def func2(i, j, a2, b2):
    """Store the sum and product of a2[i] and b2[j] in the global array ``ab2``."""
    left, right = a2[i], b2[j]
    ab2[0, i, j] = left + right  # sum plane
    ab2[1, i, j] = left * right  # product plane
# Run the demo only when executed as a script. The guard is required for
# multiprocessing-based backends (joblib spawns workers that re-import this
# module, notably on Windows), and prevents the demo from running on import.
if __name__ == "__main__":
    main()
其输出如下:
Nested loop array assignment:
[[[ 0. 1. 2. 3. 4.]
[ 1. 2. 3. 4. 5.]
[ 2. 3. 4. 5. 6.]
[ 3. 4. 5. 6. 7.]
[ 4. 5. 6. 7. 8.]]
[[ 0. 0. 0. 0. 0.]
[ 0. 1. 2. 3. 4.]
[ 0. 2. 4. 6. 8.]
[ 0. 3. 6. 9. 12.]
[ 0. 4. 8. 12. 16.]]]
Parallel nested loop assignment using a single process:
[[[ 0. 1. 2. 3. 4.]
[ 1. 2. 3. 4. 5.]
[ 2. 3. 4. 5. 6.]
[ 3. 4. 5. 6. 7.]
[ 4. 5. 6. 7. 8.]]
[[ 0. 0. 0. 0. 0.]
[ 0. 1. 2. 3. 4.]
[ 0. 2. 4. 6. 8.]
[ 0. 3. 6. 9. 12.]
[ 0. 4. 8. 12. 16.]]]
Parallel nested loop assignment using multiple process:
[[[ 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0.]]
[[ 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0.]]]
从Google和StackOverflow搜索功能看来,当使用joblib时,不会在每个子进程之间共享全局数组。我不确定这是否是对joblib的限制,或者是否有办法解决这个问题?
实际上,我的脚本被其他代码所包围,这些代码依赖于此全局数组的最终输出,其格式为(4, x , x )格式,其中 x 是可变的(但通常在100到数千的范围内)。这是我目前查看并行处理的原因,因为 x = 2400整个过程最多可能需要2个小时。
使用joblib不是必需的(但我喜欢命名和简单),所以请随意提出简单的替代方法,理想情况下要记住最终数组的要求。我正在使用python 2.7.3和joblib 0.7.1。
答案 0 :(得分:4)
我能够使用numpy的memmap解决这个简单示例的问题。 在使用memmap并遵循joblib documentation webpage上的示例后我仍然遇到问题,但是我通过pip升级到最新的joblib版本(0.9.3)并且它运行顺利。这是工作代码:
from joblib import Parallel, delayed
import numpy as np
import os
import tempfile
import shutil
def main():
    """Compare serial population with the memmap-backed parallel version.

    Uses print() calls (valid in both Python 2 and Python 3) instead of the
    original Python-2-only print statements.
    """
    print("Nested loop array assignment:")
    regular()
    print("Parallel nested loop assignment using numpy's memmap:")
    par3(4)
def regular():
    """Populate the global array ``ab`` with a serial nested loop and print it.

    ``ab`` has shape (2, len(a), len(b)); plane 0 holds pairwise sums and
    plane 1 pairwise products (written by func()).
    """
    a = [0, 1, 2, 3, 4]
    b = [0, 1, 2, 3, 4]
    # The result array is global so that func() can write into it directly.
    global ab
    ab = np.zeros((2, np.size(a), np.size(b)))
    # Iterate over every (i, j) pair to populate the array.
    for i in range(np.size(a)):
        for j in range(np.size(b)):
            func(i, j, a, b)
    # print() with a single argument behaves identically on Python 2 and 3.
    print(ab)
def par3(process):
    """Populate a file-backed (memmap) array in parallel and print it.

    Because the array is backed by a file in a temporary directory, every
    worker process writes into the same storage, so the results survive
    the round trip — unlike plain global arrays.

    :param process: number of joblib workers (n_jobs).
    """
    # Create a temporary directory and define the array path.
    path = tempfile.mkdtemp()
    ab3path = os.path.join(path, 'ab3.mmap')
    a3 = [0, 1, 2, 3, 4]
    b3 = [0, 1, 2, 3, 4]
    # mode='w+' creates (or overwrites) the file-backed array.
    ab3 = np.memmap(ab3path, dtype=float,
                    shape=(2, np.size(a3), np.size(b3)), mode='w+')
    # Each task fills one whole i-row; workers share the memmap via its file.
    # range() replaces Python-2-only xrange().
    Parallel(n_jobs=process)(
        delayed(func3)(i, a3, b3, ab3) for i in range(np.size(a3))
    )
    print(ab3)
    # Best-effort cleanup of the temporary directory. Catch only OSError:
    # the original bare except also swallowed KeyboardInterrupt/SystemExit.
    try:
        shutil.rmtree(path)
    except OSError:
        print("Couldn't delete folder: " + str(path))
def func(i, j, a, b):
    """Store the sum and product of a[i] and b[j] in the global array ``ab``."""
    left, right = a[i], b[j]
    ab[0, i, j] = left + right   # sum plane
    ab[1, i, j] = left * right   # product plane
def func3(i, a3, b3, ab3):
    """Fill row ``i`` of ``ab3``: plane 0 gets a3[i]+b3[j], plane 1 a3[i]*b3[j]."""
    left = a3[i]  # hoisted: invariant over the inner loop
    for j in range(np.size(b3)):
        ab3[0, i, j] = left + b3[j]
        ab3[1, i, j] = left * b3[j]
# Run the demo only when executed as a script. The guard is required for
# multiprocessing-based backends (joblib spawns workers that re-import this
# module, notably on Windows), and prevents the demo from running on import.
if __name__ == "__main__":
    main()
给出以下结果:
Nested loop array assignment:
[[[ 0. 1. 2. 3. 4.]
[ 1. 2. 3. 4. 5.]
[ 2. 3. 4. 5. 6.]
[ 3. 4. 5. 6. 7.]
[ 4. 5. 6. 7. 8.]]
[[ 0. 0. 0. 0. 0.]
[ 0. 1. 2. 3. 4.]
[ 0. 2. 4. 6. 8.]
[ 0. 3. 6. 9. 12.]
[ 0. 4. 8. 12. 16.]]]
Parallel nested loop assignment using numpy's memmap:
[[[ 0. 1. 2. 3. 4.]
[ 1. 2. 3. 4. 5.]
[ 2. 3. 4. 5. 6.]
[ 3. 4. 5. 6. 7.]
[ 4. 5. 6. 7. 8.]]
[[ 0. 0. 0. 0. 0.]
[ 0. 1. 2. 3. 4.]
[ 0. 2. 4. 6. 8.]
[ 0. 3. 6. 9. 12.]
[ 0. 4. 8. 12. 16.]]]
我想为未来的读者留下一些想法:当我将 a 和 a3 增大到 np.arange(0,10000),并将 b 和 b3 增大到 np.arange(0,1000) 时,“常规”方法的耗时为 12.4s,而 joblib 的 memmap 方法的耗时为 7.7s。

答案 1:(得分:0)
我正在使用的 joblib 版本(0.13.2)实际上使我能够轻松访问大型共享 DataFrame。当然,DataFrame 需要在并行循环开始之前进行预先分配,并且每个线程必须仅访问 DataFrame 的一部分才能进行写入,但是它可以工作。
# Snippet from the answer: `size`, `step`, `cols`, `other_params` and
# `_threadsafe_func` are defined elsewhere by the answer's author.
# Input frame — pre-built before the parallel loop starts.
data = pd.DataFrame(...)
# One stats row per chunk of `step` input rows, pre-filled with NaN so each
# task only has to write its own row.
stats = pd.DataFrame(np.nan, index=np.arange(0, size/step), columns=cols, dtype=np.float64)
# prefer='threads' keeps all workers in one process, so they can share and
# mutate the pre-allocated `data`/`stats` frames directly (no pickling copy).
Parallel(n_jobs=8, prefer='threads')(
delayed(_threadsafe_func)(data, stats, i, step, other_params)
for i in range(0, size, step))
然后在 _threadsafe_func 内部,可以通过以下方式读取或写入 stats DataFrame:
# Body of _threadsafe_func (snippet): map this task's loop variable `i`
# to the single stats row it owns.
# NOTE(review): i/step is float division on Python 3 — hence int(index) below.
index = i/step
# Progress logging only.
print('[' + str(i) + '] Running job with index:', str(int(index)), '/', len(data)/step)
# Each task reads only its own slice of the input frame.
chunk = data[i:i + step]
stats.loc[index, 'mean'] = chunk.mean() # 'mean' is an existing column already filled with np.nan