我正在尝试用并行化来加速我的 Julia 代码,但不使用并行化(即串行 for 循环)时,代码反而运行得更快。下面是我按照 Julia 文档中的简单示例编写的代码(将两个矩阵 q 和 u 相加),其中串行版本比并行版本快得多。
# Return the (row range, column range) of `q` that the calling process should
# update. Each process mapped to `q` gets every row and one contiguous slice
# of columns; the slices are disjoint and together cover 1:size(q, 2).
# A process that is not mapped to `q` (indexpids(q) == 0) gets two empty
# ranges, so looping over the result does nothing.
@everywhere function myrange(q::SharedArray)
    idx = indexpids(q)
    if idx == 0
        # This process is not assigned a piece
        return 1:0, 1:0
    end
    nchunks = length(procs(q))
    ncols = size(q, 2)
    # nchunks + 1 evenly spaced split points from 0 to ncols, rounded to
    # integers. Computed with plain arithmetic instead of `linspace`, which
    # was removed in Julia 1.0; this form runs on old and new versions alike.
    splits = [round(Int, k * ncols / nchunks) for k in 0:nchunks]
    return 1:size(q, 1), (splits[idx] + 1):splits[idx + 1]
end
# Kernel: add `u` to `q` in place, element by element, but only over the
# index block irange × jrange. Returns `q`.
@everywhere function advection_chunk!(q, u, irange, jrange)
    @show (irange, jrange) # display so we can see what's happening
    for col in jrange
        for row in irange
            q[row, col] += u[row, col]
        end
    end
    return q
end
# Serial version: run the kernel once over every row and column of `q`.
function advection_serial!(q, u)
    all_rows = 1:size(q, 1)
    all_cols = 1:size(q, 2)
    return advection_chunk!(q, u, all_rows, all_cols)
end
# Convenience wrapper for the SharedArray implementation: look up the index
# ranges that myrange(q) gives the calling process and run the kernel on them.
@everywhere function advection_shared_chunk!(q, u)
    irange, jrange = myrange(q)
    return advection_chunk!(q, u, irange, jrange)
end
# Parallel driver: ask every process in procs(q) to run
# advection_shared_chunk! on `q` and `u`, wait until all of those calls have
# finished, then return `q`.
function advection_shared!(q, u)
    @sync for pid in procs(q)
        # One task per process, so the remote calls are issued without
        # waiting for earlier ones; @sync blocks until every task is done.
        @async remotecall_wait(advection_shared_chunk!, pid, q, u)
    end
    return q
end
定义这些函数之后,我用 @time 来比较两者的计算时间:
# Two 5×10000 shared Float64 matrices: `q` is updated in place and `u` is the
# matrix added to it.
q = SharedArray(Float64, (5,10000))
u = SharedArray(Float64, (5,10000))
# Call each version once before timing; the first call of a function also
# includes its compilation.
advection_serial!(q, u)
advection_shared!(q, u) # use @time after running both once
然后我得到了一些奇怪的结果......
1)串行代码比并行代码快约 40 倍(添加 3 个 worker 进程后:0.000576 秒 vs. 0.023328 秒)
2)添加更多 worker 进程时,并行代码反而变得更慢。
我是不是哪里做错了?有没有办法让这个例子中的并行代码运行得更快?