Parallel statistics along a dimension of a Julia array

Time: 2016-03-08 03:14:00

Tags: parallel-processing julia

What are best practices in Julia for computing statistics of an array along a given dimension in parallel? I have many large arrays and am looking for something like mean(array, 1), but in parallel (and returning quantiles). I cannot simply process several arrays at once in parallel, because I do not have enough RAM for that.
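Concretely, the serial version of what I am after looks like this (just a sketch, using the same array shape as in my benchmark below):

data = rand(Int, 128, 256, 256, 64)
mean(data, 1)                               # this kind of per-dimension reduction, but...
mapslices(v -> quantile(v, 0.2), data, 1)   # ...computing quantiles, and in parallel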

I wrote a crude benchmark that also shows the approaches I have tried so far: mapslices, a plain loop, and @parallel loops over SharedArrays and DArrays (see below). Parallelization does not seem to speed things up much: adding 7 workers gives me a 1.8x speedup with SharedArrays and a 2.3x speedup with DArrays. I am quite new to Julia. Is this expected, or am I doing something wrong?

Thanks for any help. Below is the output of my script, followed by the script itself.

Script output:

WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
WARNING: replacing module DistributedArrays
mapslices on Array
 38.152894 seconds (218.71 M allocations: 14.435 GB, 3.33% gc time)
 37.985577 seconds (218.10 M allocations: 14.406 GB, 3.23% gc time)
loop over Array using CartesianRange
  9.161392 seconds (25.27 M allocations: 9.005 GB, 4.41% gc time)
  9.118627 seconds (25.17 M allocations: 9.000 GB, 4.40% gc time)
@parallel loop over SharedArray
  9.092477 seconds (322.23 k allocations: 14.190 MB, 0.05% gc time)
  4.945648 seconds (18.90 k allocations: 1.405 MB)
@parallel loop over DArray
  5.615429 seconds (496.26 k allocations: 21.535 MB, 0.08% gc time)
  3.932704 seconds (15.63 k allocations: 1.178 MB)
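
(The speedups I quote above come from the second timing of each method: 9.12 s for the serial CartesianRange loop versus 4.95 s for the SharedArray version, about 1.8x, and versus 3.93 s for the DArray version, about 2.3x.)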

Script:

procs_added = addprocs(CPU_CORES - 1)
@everywhere using DistributedArrays

function benchmark_array(dtype, dims)
    data = rand(dtype, dims...)

    println("mapslices on Array")
    @time out = mapslices(f->quantile(f, 0.2), data, 1)
    @time out = mapslices(f->quantile(f, 0.2), data, 1)

    println("loop over Array using CartesianRange")
    out = Array(Float32, size(data)[2:end])
    @time loop_over_array!(out, data)
    @time loop_over_array!(out, data)
end

function loop_over_array!(out::Array, data::Array)
    for I in CartesianRange(size(out))
        # explicit indexing, since [:, I...] didn't work
        out[I] = quantile(data[:, I[1], I[2], I[3]], 0.2)
    end
end

function benchmark_shared_array(dtype, dims)
    data = SharedArray(dtype, (dims...), pids=workers())

    println("@parallel loop over SharedArray")
    out = SharedArray(Float32, size(data)[2:end], pids=workers())
    @time parallel_loop_over_shared_array!(out, data)
    @time parallel_loop_over_shared_array!(out, data)
end

function parallel_loop_over_shared_array!(out::SharedArray, data::SharedArray)
    # @parallel for I in CartesianRange(size(out)) does not seem to work
    @sync @parallel for i in 1:size(out)[end]
        for I in CartesianRange(size(out)[1:end-1])
            out[I[1], I[2], i] = quantile(data[:, I[1], I[2], i], 0.2)
        end
    end
end

function benchmark_distributed_array(dtype, dims)
    data = drand(dtype, (dims...), workers(),
        [i == length(dims) ? nworkers() : 1 for i in 1:length(dims)])

    println("@parallel loop over DArray")
    out = dzeros(Float32, size(data)[2:end], workers(),
        [i ==  ndims(data) ? nworkers() : 1 for i in 2:ndims(data)])
    @time parallel_loop_over_distributed_array!(out, data)
    @time parallel_loop_over_distributed_array!(out, data)
end

function parallel_loop_over_distributed_array!(out::DArray, data::DArray)
    @sync for pid in workers()
        @spawnat pid begin
            inchunk = localpart(data)
            outchunk = localpart(out)
            for I in CartesianRange(size(outchunk))
                outchunk[I] = quantile(inchunk[:, I[1], I[2], I[3]], 0.2)
            end
        end
    end
end

function benchmark_all(dtype, dims)
    benchmark_array(dtype, dims)
    benchmark_shared_array(dtype, dims)
    benchmark_distributed_array(dtype, dims)
end

const dtype = Int
const dims = [128,256,256,64]
benchmark_all(dtype, dims)
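
One more variant I have sketched but not benchmarked: let each worker fill only the part of out that it owns, via localindexes, instead of relying on @parallel to chunk the last dimension. (local_quantiles! and parallel_local! are just names I made up for this sketch; it assumes the same 4-D data / 3-D out SharedArrays as above.)

@everywhere function local_quantiles!(out::SharedArray, data::SharedArray, p)
    # each process only writes the linear indices of `out` assigned to it
    for li in localindexes(out)
        I = ind2sub(size(out), li)
        out[li] = quantile(data[:, I[1], I[2], I[3]], p)
    end
end

function parallel_local!(out::SharedArray, data::SharedArray, p)
    @sync for pid in procs(out)
        @async remotecall_wait(pid, local_quantiles!, out, data, p)
    end
end

Usage would be parallel_local!(out, data, 0.2) on the same SharedArrays as in benchmark_shared_array.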

0 Answers:

There are no answers yet.