Question

在一段性能至关重要的代码中，我需要对许多大型数组（d1 = 256, d2 = 256, d3 = 16）的维度进行重新排序。这种操作类似于矩阵转置，因此我决定尝试一下 xtensor。令人惊讶的是，xtensor 代码的耗时几乎是我手写的朴素循环的两倍。我怎样才能加快速度？请注意，我无法复制数据。

/**
 * @brief Reference dimension reordering (1,2,3) -> (3,2,1) with plain loops.
 *
 * Element (i, j, k) is read from `in` at offset `i + j*d1 + k*d1*d2` and
 * written to `out` at offset `k + j*d3 + i*d3*d2`.
 *
 * @param out Destination buffer; written at offsets in [0, d1*d2*d3).
 * @param in  Source buffer; read at offsets in [0, d1*d2*d3).
 * @param d1  Extent of the first input dimension.
 * @param d2  Extent of the second input dimension.
 * @param d3  Extent of the third input dimension.
 *
 * Both pointers are `__restrict__`-qualified, so the caller promises that
 * the two buffers do not overlap.
 */
template<typename TF>
inline void reorder123x321_ref(
        TF* __restrict__ out, const TF* __restrict__ in,
        const int d1, const int d2, const int d3)
{
    // Strides of the source layout (i is the unit-stride index).
    const int src_stride_j = d1;
    const int src_stride_k = d1*d2;

    // Strides of the destination layout (k is the unit-stride index).
    const int dst_stride_j = d3;
    const int dst_stride_i = d3*d2;

    for (int i=0; i<d1; ++i)
    {
        const int dst_plane = i*dst_stride_i;

        for (int j=0; j<d2; ++j)
        {
            // Offsets shared by every k of this (i, j) pair.
            const int src_base = i + j*src_stride_j;
            const int dst_base = j*dst_stride_j + dst_plane;

            // Ask GCC to ignore assumed loop-carried dependencies when vectorizing.
            #pragma GCC ivdep
            for (int k=0; k<d3; ++k)
                out[dst_base + k] = in[src_base + k*src_stride_k];
        }
    }
}

/**
 * @brief xtensor-based dimension reordering (1,2,3) -> (3,2,1).
 *
 * Wraps the raw `in` and `out` buffers in non-owning, column-major xtensor
 * adaptors of shape {d1, d2, d3} and {d3, d2, d1} respectively, then assigns
 * the transpose of the input view to the output view.
 *
 * @param out Destination buffer holding d1*d2*d3 elements.
 * @param in  Source buffer holding d1*d2*d3 elements.
 * @param d1  Extent of the first input dimension.
 * @param d2  Extent of the second input dimension.
 * @param d3  Extent of the third input dimension.
 */
template<typename TF>
inline void reorder123x321_test(
        TF* out, const TF* in,
        const size_t d1, const size_t d2, const size_t d3)
{
    const size_t size = d1*d2*d3;
    const std::array<size_t, 3> in_shape = { d1, d2, d3 };
    const std::array<size_t, 3> out_shape = { d3, d2, d1 };

    // xt::no_ownership(): the adaptors view the caller's memory; nothing is copied or freed.
    const auto a_in = xt::adapt<xt::layout_type::column_major>(in, size, xt::no_ownership(), in_shape);
    auto a_out = xt::adapt<xt::layout_type::column_major>(out, size, xt::no_ownership(), out_shape);

    // Transpose the INPUT view (the original transposed a_out onto itself and
    // never read a_in). xt::noalias skips the intermediate temporary, which is
    // valid here only because `in` and `out` are expected to be distinct buffers.
    xt::noalias(a_out) = xt::transpose(a_in);
}

使用 xtensor 进行维度重新排序非常慢

0 个答案: