Question

我正在尝试将一些R代码移植到Julia; 基本上我在Julia中重写了以下R代码：

Components are isolated entities that consume data through their
interface, react to data changes that flow via data binding, and 
possibly send up named actions.

可以看到，当我从 1 个核心增加到 11 个核心时，速度有显著提升。现在我想在 Julia 中做同样的事情：

library(parallel)

# Two samples of one million rnorm draws each.
eps_1<-rnorm(1000000)
eps_2<-rnorm(1000000)

# Two-column 0/1 matrix: 1 where the corresponding draw is positive, 0 otherwise.
large_matrix<-ifelse(cbind(eps_1,eps_2)>0,1,0)
# Every combination of 0/1 over two columns (four rows).
matrix_to_compare = expand.grid(c(0,1),c(0,1))
# First row of each consecutive 4-row slice.
indices<-seq(1,1000000,4)
# Replace the matrix by a list holding its 4-row slices.
large_matrix<-lapply(indices,function(i)(large_matrix[i:(i+3),]))

# For one 4-row slice `x`, return the row positions at which both entries equal
# the entries in the same row of `matrix_to_compare`.
function_compare<-function(x){
  which((rowSums(x==matrix_to_compare)==2) %in% TRUE)
}

> system.time(lapply(large_matrix,function_compare))
   user  system elapsed 
 38.812   0.024  38.828 
> system.time(mclapply(large_matrix,function_compare,mc.cores=11))
   user  system elapsed 
 63.128   1.648   6.108

可以看到，使用 pmap 并没有让我获得加速。也许有人可以提出替代方案。

Answer 1

我认为这里的部分问题在于，@parallel 和 pmap 并不总能很好地处理与 worker 之间来回传输的数据。因此，当你执行的任务几乎不需要移动数据时，它们的效果往往最好。我也怀疑可以通过某些办法改善它们的性能，但我不确定具体细节。

对于需要更多数据移动的情况，最好坚持使用直接在 worker 上调用函数的方式，并让这些函数访问位于该 worker 内存空间中的对象。我在下面给出一个例子，它使用多个 worker 来加速你的函数。它使用的可能是最简单的选项 @everywhere，但 @spawn、remotecall() 等也值得考虑，具体取决于你的情况。

# Start 11 additional worker processes.
addprocs(11);

using Distributions;
# Loaded on every process; presumably this is where `product`, used inside the
# `@everywhere`-defined `function_split`, comes from.
@everywhere using Iterators;
# `Normal()` with its default parameters; the samples below are drawn from it.
d = Normal();

# Two samples of one million draws each from `d`.
eps_1 = rand(d,1000000);
eps_2 = rand(d,1000000);

# Create a large matrix: one column per sample, `true` where the draw is >= 0.
large_matrix = hcat(eps_1,eps_2).>=0;
# First row of each consecutive 4-row slice.
indices = collect(1:4:1000000);

# Split large matrix: rebind the name to a collection of 4-row slices,
# one per entry of `indices`.
large_matrix = [large_matrix[i:(i+3),:] for i in indices];

# Force the collection's element type to `BitArray`
# (`function_split` is declared for `x::BitArray`).
large_matrix = convert(Array{BitArray}, large_matrix);

# Assign every keyword argument as a global variable in `Main` on process `p`.
#
# Each `name=value` keyword is turned into the assignment expression
# `name = value`, which is evaluated in `Main` on `p` through `@spawnat`.
# The remote calls are not waited on, so the function can return before the
# assignments have taken effect. Returns `nothing`.
function sendto(p::Int; args...)
    for binding in args
        name, value = binding[1], binding[2]
        assignment = Expr(:(=), name, value)
        @spawnat(p, eval(Main, assignment))
    end
end

# Fetch the value bound to `nm` in module `mod` (default `Main`) on process `p`.
#
# The lookup (`getfield(mod, nm)`) runs on `p` via `@spawnat`; the call blocks
# until the value has been fetched back to the caller.
function getfrom(p::Int, nm::Symbol; mod=Main)
    remote_value = @spawnat(p, getfield(mod, nm))
    return fetch(remote_value)
end

# Compare a 4-row, 2-column boolean slice with the four (0/1, 0/1) combinations,
# row by row, and return the indices of the rows where both entries agree.
#
# The reference rows are, in order, (0,0), (1,0), (0,1), (1,1) -- the same table
# the cartesian product of [0,1] with itself yields when its first component
# varies fastest.
@everywhere function function_split(x::BitArray)
    reference = [0 0; 1 0; 0 1; 1 1] .> 0
    # Number of matching entries per row (0, 1 or 2), as a 4x1 array.
    matches_per_row = sum(x .== reference, 2)
    find(matches_per_row .== 2)
end


# Split `X` into contiguous chunks and assign one chunk per worker.
#
# Arguments:
#   X          -- array to split; it is cut along its first dimension and the
#                 chunks are taken with linear indexing (`X[first:last]`).
#   WorkerName -- name of the global variable (in `Main`) that receives the
#                 chunk on each worker.
#
# Every worker except the last receives `div(size(X,1), nworkers())` elements;
# the last worker additionally receives the remainder. Returns `nothing`.
#
# NOTE: the remote assignments are issued with `@spawnat` and are not waited on,
# so they may still be in flight when this function returns.
function distribute_data(X::Array, WorkerName::Symbol)
    total = size(X,1)
    size_per_worker = div(total, nworkers())
    StartIdx = 1
    for (idx, pid) in enumerate(workers())
        # The last worker takes everything that is left over.
        EndIdx = idx == nworkers() ? total : StartIdx + size_per_worker - 1
        # Slice on the calling process: the closure built by `@spawnat` is
        # serialized together with whatever it captures, so referring to `X`
        # inside it would ship the entire array to every worker.
        chunk = X[StartIdx:EndIdx]
        assignment = Expr(:(=), WorkerName, chunk)
        @spawnat(pid, eval(Main, assignment))
        StartIdx = EndIdx + 1
    end
end

# Hand each worker its share of the slices; on the worker the share is bound to
# the global name `large_matrix`.
distribute_data(large_matrix, :large_matrix)


# Apply `function_split` to the data held by each worker and gather the results.
#
# Every process other than process 1 maps `function_split` over its own global
# `large_matrix` and stores the outcome in its global `result`. The per-worker
# results are then fetched in `workers()` order and concatenated with `vcat`.
function parallel_split()
    @everywhere begin
        # Process 1 is skipped; only the workers compute.
        myid() == 1 || (result = map(function_split, large_matrix))
    end
    partials = Any[getfrom(pid, :result) for pid in workers()]
    return vcat(partials...)
end

## Timings below were recorded on a repeat run, i.e. after a first run had
## already compiled the code.
## Serial baseline: map function_split over every slice on the calling process.
@time a = map(function_split,large_matrix); ## 6.499737 seconds (22.00 M allocations: 2.899 GB, 13.99% gc time)
## Parallel version: each worker maps over its own share, results are gathered.
@time b = parallel_split();  ## 1.097586 seconds (1.50 M allocations: 64.508 MB, 3.28% gc time)

julia> a == b
true

注意：即便如此，多进程带来的加速也并不完美。不过这在意料之中，因为你的函数仍然会返回一定量的数据，而这些数据必须在进程之间移动，这需要时间。

P.S. 有关此处使用的 sendto 和 getfrom 函数的更多信息，请参阅这篇帖子（Julia: How to copy data to another processor in Julia）或这个包（https://github.com/ChrisRackauckas/ParallelDataTransfer.jl）。

Julia pmap 性能

1 个答案: