Question

在 Julia 中，计算下式的最快方法是什么：

$$A = \sum_{t=1}^{T} y_t y_t'$$

其中 $y_t$ 是时间 $t$ 处变量的 $n$ 维列向量。

在Julia代码中，一个选项是：

# Accumulate A = sum over j of (row j of Y)' * (row j of Y); A is n-by-n.
A = zeros(n,n);
for j=1:T
    # Julia indexes with square brackets; Y(j,:) would be parsed as a function call.
    # The range j:j keeps row j as a 1-by-n matrix, so row' * row is the n-by-n
    # outer product rather than a scalar.
    row = Y[j:j,:];
    # Update A's contents instead of rebinding the global A inside the loop, so the
    # snippet also works when run from a file.
    A .+= row' * row;
end

其中

Y = [y_1'
     ...
     y_T']

是 $T \times n$ 矩阵。

但是，有更快的方法吗？感谢。

Answer 1

为了比较，我尝试了几种计算矩阵 A 的代码（希望这正是 OP 想要的……），包括内置矩阵乘法、`BLAS.ger!` 和显式循环：

# Print `x` padded on the right to a width of 12, without a trailing newline,
# so that the `@time` report that follows lines up in a column.
function print_(x)
    label = rpad(x, 12)
    return print(label)
end

"""
    perf0v(n, T, Y)

Sum the outer products `Y[:, t] * Y[:, t]'` for `t = 1:T` using the built-in
vector * vector' product, and return the resulting `n`-by-`n` matrix.

Prints the label `"perf0v"` through `print_` before computing.
"""
function perf0v( n, T, Y )
    print_("perf0v")
    out = zeros(n,n)
    for t = 1 : T
        # `view` replaces `slice`, which no longer exists in current Julia
        # (the timings quoted after this listing were taken on 0.4.5).
        yt = view( Y, :, t )
        # `out += ...` builds a new matrix on every pass; that is kept as-is
        # because this variant is the plain built-in formulation being compared.
        out += yt * yt'
    end
    return out
end

"""
    perf0m(n, T, Y)

Return `Y * Y'` computed with the built-in matrix * matrix' product.

`n` and `T` are not used here; they are accepted so the signature matches the
other variants. Prints the label `"perf0m"` through `print_` first.
"""
function perf0m( n, T, Y )
    print_("perf0m")
    return Y * Y'
end

"""
    perf1(n, T, Y)

Sum the outer products of the columns of `Y` with one `BLAS.ger!` rank-1
update per column, and return the accumulated `n`-by-`n` matrix.

Each column is passed as a freshly indexed copy `Y[:, t]`. Prints the label
`"perf1"` through `print_` first.
"""
function perf1( n, T, Y )
    print_("perf1")
    acc = zeros(n, n)
    for col in 1:T
        # Two separate `Y[:, col]` indexing operations, one per vector argument.
        x = Y[:, col]
        y = Y[:, col]
        BLAS.ger!(1.0, x, y, acc)
    end
    return acc
end

"""
    perf1sub(n, T, Y)

Sum the outer products of the columns of `Y` with one `BLAS.ger!` rank-1
update per column, passing each column as a view instead of a copy.

Returns the accumulated `n`-by-`n` matrix. Prints the label `"perf1sub"`
through `print_` first. `BLAS` must be in scope (on Julia >= 0.7 this means
`using LinearAlgebra`).
"""
function perf1sub( n, T, Y )
    print_("perf1sub")
    out = zeros(n,n)
    for t = 1 : T
        # `view` replaces `sub`, which no longer exists in current Julia
        # (the timings quoted after this listing were taken on 0.4.5).
        yt = view( Y, :, t )
        BLAS.ger!( 1.0, yt, yt, out )
    end
    return out
end

"""
    perf2(n, T, Y)

Sum the outer products of the columns of `Y` with explicit scalar loops and
return the `n`-by-`n` result.

Loop order is `t` (outermost), then the column index of the result, then its
row index (innermost). Prints the label `"perf2"` through `print_` first.
"""
function perf2( n, T, Y )
    print_("perf2")
    acc = zeros(n, n)
    for t in 1:T
        for col in 1:n
            for row in 1:n
                acc[row, col] += Y[row, t] * Y[col, t]
            end
        end
    end
    return acc
end

"""
    perf2simd(n, T, Y)

Explicit-loop sum of outer products in which, for every entry of the result,
the innermost loop runs over `t` and is annotated with `@simd`.

Returns the `n`-by-`n` result. Prints the label `"perf2simd"` through `print_`
first.
"""
function perf2simd( n, T, Y )
    print_("perf2simd")
    acc = zeros(n, n)
    for col in 1:n
        for row in 1:n
            @simd for t in 1:T
                acc[row, col] += Y[row, t] * Y[col, t]
            end
        end
    end
    return acc
end

"""
    perf2tr(n, T, Yt)

Same loop nest as `perf2`, but reading from the transposed layout: `Yt` is
indexed as `Yt[t, i]`, with `t` in `1:T` and `i` in `1:n`.

Returns the `n`-by-`n` result. Prints the label `"perf2tr"` through `print_`
first.
"""
function perf2tr( n, T, Yt )
    print_("perf2tr")
    acc = zeros(n, n)
    for t in 1:T
        for col in 1:n
            for row in 1:n
                acc[row, col] += Yt[t, row] * Yt[t, col]
            end
        end
    end
    return acc
end

"""
    perf2simdtr(n, T, Yt)

Same loop nest as `perf2simd`, but reading from the transposed layout: `Yt` is
indexed as `Yt[t, i]`, so the `@simd` loop over `t` runs along the first index.

Returns the `n`-by-`n` result. Prints the label `"perf2simdtr"` through
`print_` first.
"""
function perf2simdtr( n, T, Yt )
    print_("perf2simdtr")
    acc = zeros(n, n)
    for col in 1:n
        for row in 1:n
            @simd for t in 1:T
                acc[row, col] += Yt[t, row] * Yt[t, col]
            end
        end
    end
    return acc
end

#.........................................................

# `BLAS` (used by perf1 and perf1sub) lives in the LinearAlgebra standard library
# on Julia >= 0.7 and is not in scope until it is loaded.
using LinearAlgebra

n = 100
T = 1000
@show n, T

Y = rand( n, T )
Yt = copy( Y' )    # materialised T-by-n transpose, input of the *tr variants

# Latest result of each variant, keyed by the function itself.
out = Dict{Function,Matrix{Float64}}()

# Every variant is run twice; presumably the second pass is the one to read,
# since the first one also pays for compilation.
for loop = 1:2
    println("loop = ", loop)

    for fn in [ perf0v, perf0m, perf1, perf1sub, perf2, perf2simd ]
        @time out[ fn ] = fn( n, T, Y )
    end
    for fn in [ perf2tr, perf2simdtr ]
        @time out[ fn ] = fn( n, T, Yt )
    end
end

# Check
# Sum of absolute element-wise differences over every ordered pair of results.
# Kept in a function so the accumulator is a local: updating a global from a
# top-level loop fails when the file is run as a script on Julia >= 1.0, and a
# global named `error` would shadow `Base.error`.
function total_discrepancy( results )
    total = 0.0
    for a in values( results ),
        b in values( results )
        # `sum(abs, x)` replaces `sumabs(x)`, which no longer exists.
        @assert sum( abs, a ) > 0.0
        @assert sum( abs, b ) > 0.0
        total += sum( abs, a - b )
    end
    return total
end

println("error = ", total_discrepancy( out ))

使用julia -O --check-bounds=no test.jl（版本0.4.5）获得的结果是：

(n,T) = (100,1000)
loop = 2
perf0v        0.056345 seconds (15.04 k allocations: 154.803 MB, 31.66% gc time)
perf0m        0.000785 seconds (7 allocations: 78.406 KB)
perf1         0.155182 seconds (5.96 k allocations: 1.846 MB)
perf1sub      0.155089 seconds (8.01 k allocations: 359.625 KB)
perf2         0.011192 seconds (6 allocations: 78.375 KB)
perf2simd     0.016677 seconds (6 allocations: 78.375 KB)
perf2tr       0.011698 seconds (6 allocations: 78.375 KB)
perf2simdtr   0.009682 seconds (6 allocations: 78.375 KB)

以及 n 和 T 取其他一些值时的结果：

(n,T) = (1000,100)
loop = 2
perf0v        0.610885 seconds (2.01 k allocations: 1.499 GB, 25.11% gc time)
perf0m        0.008866 seconds (9 allocations: 7.630 MB)
perf1         0.182409 seconds (606 allocations: 9.177 MB)
perf1sub      0.180720 seconds (806 allocations: 7.657 MB, 0.67% gc time)
perf2         0.104961 seconds (6 allocations: 7.630 MB)
perf2simd     0.119964 seconds (6 allocations: 7.630 MB)
perf2tr       0.137186 seconds (6 allocations: 7.630 MB)
perf2simdtr   0.103878 seconds (6 allocations: 7.630 MB)

(n,T) = (2000,100)
loop = 2
perf0v        2.514622 seconds (2.01 k allocations: 5.993 GB, 24.38% gc time)
perf0m        0.035801 seconds (9 allocations: 30.518 MB)
perf1         0.473479 seconds (606 allocations: 33.591 MB, 0.04% gc time)
perf1sub      0.475796 seconds (806 allocations: 30.545 MB, 0.95% gc time)
perf2         0.422808 seconds (6 allocations: 30.518 MB)
perf2simd     0.488539 seconds (6 allocations: 30.518 MB)
perf2tr       0.554685 seconds (6 allocations: 30.518 MB)
perf2simdtr   0.400741 seconds (6 allocations: 30.518 MB)

(n,T) = (3000,100)
loop = 2
perf0v        5.444797 seconds (2.21 k allocations: 13.483 GB, 20.77% gc time)
perf0m        0.080458 seconds (9 allocations: 68.665 MB)
perf1         0.927325 seconds (806 allocations: 73.261 MB, 0.02% gc time)
perf1sub      0.926690 seconds (806 allocations: 68.692 MB, 0.51% gc time)
perf2         0.958189 seconds (6 allocations: 68.665 MB)
perf2simd     1.067098 seconds (6 allocations: 68.665 MB)
perf2tr       1.765001 seconds (6 allocations: 68.665 MB)
perf2simdtr   0.902838 seconds (6 allocations: 68.665 MB)

嗯，所以内置的矩阵 * 矩阵（`Y * Y'`）最快。从 `@less Y * Y'` 的输出来看，最终调用的似乎是 BLAS 的 gemm。

Answer 2

如果您事先知道 y_t 的各个分量，那么最简单、最直接、也很可能是最快的方法就是：

A = Y*Y'

其中，y_t 的各个取值作为列存储在矩阵 Y 中。

如果您事先不知道y_t的组件，可以使用BLAS：

using LinearAlgebra    # provides `BLAS` on Julia >= 0.7

n = 100;
t = 1000;
Y = rand(n,t);    # column k of Y holds one vector y_t

outer_sum = zeros(n,n);

# One BLAS rank-1 update per column: outer_sum += 1.0 * y * y', done in place.
for tdx = 1:t
    y = view(Y, :, tdx)    # reference the column instead of copying it twice
    BLAS.ger!(1.0, y, y, outer_sum)
end

如果您不熟悉 BLAS，并希望了解此函数各个参数的含义，请参阅这篇帖子（其中有一个类似的示例）。

此处的关键之一是将 y_t 向量存储为 Y 的列，而不是 Y 的行，因为访问列比访问行快得多。有关详细信息，请参阅 Julia performance tips。

更新：对于第二个选项（事先不知道 Y 的各个分量是什么），BLAS 有时最快，但并非总是如此。决定性因素是您所使用的向量的大小。调用 BLAS 会产生一定的开销，因此只在某些情形下才值得。Julia 的原生矩阵乘法会自动选择是否使用 BLAS，而且通常选择得不错。但是，如果您事先知道所处理的正是 BLAS 最优的情形，那么可以通过提前显式指定来为 Julia 优化器节省一些工作（从而加快代码速度）。

请参阅下面 roygvib 的精彩回答。它提供了大量富有创意和指导意义的方法来计算这些点积的总和。在某些情况下，其中许多方法会比 BLAS 更快。从 roygvib 提供的计时试验来看，盈亏平衡点大约在 n = 3000。

Answer 3

为完整起见，这里再给出一种向量化方法：

假设Y如下：

julia> Y = rand(1:10, 10,5)
10×5 Array{Int64,2}:
 2   1   6   2  10
 8   2   6   8   2
 2  10  10   4   6
 5   9   8   5   1
 5   4   9   9   4
 4   6   3   4   8
 2   9   2   8   1
 6   8   5  10   2
 1   7  10   6   9
 8   7  10  10   8

julia> Y = reshape(Y, 10,5,1); # add a singular 3rd dimension, so we can 
                               # be allowed to shuffle the dimensions

思路是：创建一个在维度 1 和 3 上定义、且只有一列的数组，再将其与一个在维度 2 和 3 上定义、但只有一行的数组逐元素相乘。您的“时间”变量沿维度 3 变化。这实际上为每个时间步得到一个 Kronecker 乘积，并沿时间（即第 3）维度拼接起来。

julia> KroneckerProducts = permutedims(Y, [2,3,1]) .* permutedims(Y, [3,2,1]);

现在我不清楚你的最终结果是否是一个“nxn”矩阵，由每个'kronecker'位置的所有时间总和产生

julia> sum(KroneckerProducts, 3)
5×5×1 Array{Int64,3}:
[:, :, 1] =
 243  256  301  324  192
 256  481  442  427  291
 301  442  555  459  382
 324  427  459  506  295
 192  291  382  295  371

或简单地说是那个大规模3D数组中所有元素的总和

julia> sum(KroneckerProducts)
8894

选择您喜欢的毒药：p

我不确定这是否会比 Michael 上面的方法更快，因为 permutedims 步骤可能开销很大，对于非常大的数组，它实际上可能成为瓶颈（但我不知道它在 Julia 中是如何实现的……也许并不是瓶颈！），所以即使它是“向量化代码”，也不一定比在每个时间步上迭代的循环更好。您可以两种方法都试一试，亲自看看对您的特定数组哪种最快！

计算外积之和的最快方法是什么 [Julia]

3 个答案: