使用ARM NEON的8x8 float32_t矩阵乘法反而更慢?

时间:2017-11-22 02:32:45

标签: arm matrix-multiplication simd neon

我想知道是哪些内在函数(intrinsics)使SIMD版本比普通的矩阵乘法更慢,以及我该怎么做才能用SIMD更快地完成大矩阵的乘法。在这里,我们有matrixA[8][8]、matrixB[8][8]和结果matrixC[8][8]。因为一个向量最多只能容纳4个float32_t元素,所以每一行我做了2组vmul和vadd,这看起来优化得并不好。我在ARMv7-A Cortex A8上工作。

void matrix_mult_neon (void)
{
    int i;

    float32x4x2_t vectB1, vectB2, vectB3, vectB4, vectB5, vectB6, vectB7, vectB8;
    vectB1 = vld2q_f32(matrixB[0]);
    vectB2 = vld2q_f32(matrixB[1]);
    vectB3 = vld2q_f32(matrixB[2]);
    vectB4 = vld2q_f32(matrixB[3]);
    vectB5 = vld2q_f32(matrixB[4]);
    vectB6 = vld2q_f32(matrixB[5]);
    vectB7 = vld2q_f32(matrixB[6]);
    vectB8 = vld2q_f32(matrixB[7]);


    float32x4x2_t vectT1, vectT2, vectT3, vectT4, vectT5, vectT6, vectT7, vectT8; 
    for (i = 0; i < 8; i++)
    {
        vectT1.val[0] = vmulq_n_f32(vectB1.val[0], matrixA[i][0]);
        vectT1.val[1] = vmulq_n_f32(vectB1.val[1], matrixA[i][0]);
        vectT2.val[0] = vmulq_n_f32(vectB2.val[0], matrixA[i][1]);
        vectT2.val[1] = vmulq_n_f32(vectB2.val[1], matrixA[i][1]);
        vectT3.val[0] = vmulq_n_f32(vectB3.val[0], matrixA[i][2]);
        vectT3.val[1] = vmulq_n_f32(vectB3.val[1], matrixA[i][2]);
        vectT4.val[0] = vmulq_n_f32(vectB4.val[0], matrixA[i][3]);
        vectT4.val[1] = vmulq_n_f32(vectB4.val[1], matrixA[i][3]);
        vectT5.val[0] = vmulq_n_f32(vectB5.val[0], matrixA[i][4]);
        vectT5.val[1] = vmulq_n_f32(vectB5.val[1], matrixA[i][4]);
        vectT6.val[0] = vmulq_n_f32(vectB6.val[0], matrixA[i][5]);
        vectT6.val[1] = vmulq_n_f32(vectB6.val[1], matrixA[i][5]);
        vectT7.val[0] = vmulq_n_f32(vectB7.val[0], matrixA[i][6]);
        vectT7.val[1] = vmulq_n_f32(vectB7.val[1], matrixA[i][6]);
        vectT8.val[0] = vmulq_n_f32(vectB8.val[0], matrixA[i][7]);
        vectT8.val[1] = vmulq_n_f32(vectB8.val[1], matrixA[i][7]);


        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT2.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT3.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT4.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT5.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT6.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT7.val[0]);
        vectT1.val[0] = vaddq_f32(vectT1.val[0], vectT8.val[0]);

        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT2.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT3.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT4.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT5.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT6.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT7.val[1]);
        vectT1.val[1] = vaddq_f32(vectT1.val[1], vectT8.val[1]);

        vst2q_f32(matrixC_neon[i], vectT1);
    }
}

我的常规矩阵乘法函数:

/*
 * Scalar reference implementation of the 8x8 matrix product:
 * matrixC = matrixA * matrixB.
 *
 * Each output element is the dot product of one row of matrixA with one
 * column of matrixB, accumulated in a float in increasing k order.
 */
void matrix_mult (void)
{
    int row, col, inner;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            float dot = 0;

            for (inner = 0; inner < 8; inner++) {
                dot += matrixA[row][inner] * matrixB[inner][col];
            }

            matrixC[row][col] = dot;
        }
    }
}

我使用<sys/time.h>库中的gettimeofday()函数来计算以纳秒为单位的时间(注意:gettimeofday()本身的分辨率是微秒)。

1 个答案:

答案 0 :(得分:4)

问题:

  • aarch32的NEON寄存器组(register bank)总大小只有256字节。
  • 一个8x8的浮点矩阵就已经有256字节大,而你需要三个这样的矩阵(共768字节)。
  • 您必须“垂直”(按列)读取矩阵B,这意味着在物理上不可能以“流式”方式读取来获得最佳的数据局部性。
  • 你做的是向量-标量乘法,其总耗时是向量-向量乘法的四倍。
  • 您通过VFP加载矩阵A。Cortex-A8上的VFP本身就特别慢,此外还有NEON<->VFP之间的切换开销。与自动矢量化不同,内在函数(intrinsics)几乎完全按照你写的方式执行,而你给出了错误的指令。

解决方案:

我们转置矩阵B,并逐行进行点积运算。

我希望下面的代码适合您。如果性能至关重要,请考虑编写汇编,因为编译器在NEON性能方面不是很值得信赖,即使使用内在函数也是如此。

static __always_inline float32x2_t dotProduct(float32x4x2_t input1, float32x4x2_t input2)
{
    float32x2_t d0, d1;
    float32x4_t q0;
    input1.val[0] = vmulq_f32(input1.val[0], input2.val[0]);
    input1.val[1] = vmulq_f32(input1.val[1], input2.val[1]);

    q0 = vaddq_f32(input1.val[0], input1.val[1]);
    d0 = vget_low_f32(q0);
    d1 = vget_high_f32(q0);
    d0 = vpadd_f32(d0, d1);
    d0 = vpadd_f32(d0, d1);
    return d0;
}

void matMulF_neon(float *pDst, float *pMatA, float *pMatB)
{
    float32x4x4_t line01, line23, line45, line67;
    float32x4x2_t b[8], *pA, *pB, temp;
    float32x2x4_t result;
    uint32_t i;

    // vld4 for easier transpose
    line01 = vld4q_f32(pMatB++);
    line23 = vld4q_f32(pMatB++);
    line45 = vld4q_f32(pMatB++);
    line67 = vld4q_f32(pMatB);

    // transpose MatB
    vuzpq_f32(line01.val[0], line45.val[0]);
    vuzpq_f32(line01.val[1], line45.val[1]);
    vuzpq_f32(line01.val[2], line45.val[2]);
    vuzpq_f32(line01.val[3], line45.val[3]);

    vuzpq_f32(line23.val[0], line67.val[0]);
    vuzpq_f32(line23.val[1], line67.val[1]);
    vuzpq_f32(line23.val[2], line67.val[2]);
    vuzpq_f32(line23.val[3], line67.val[3]);

    // store MatB to stack
    b[0].val[0] = line01.val[0];
    b[0].val[1] = line01.val[1];
    b[1].val[0] = line01.val[2];
    b[1].val[1] = line01.val[3];
    b[2].val[0] = line23.val[0];
    b[2].val[1] = line23.val[1];
    b[3].val[0] = line23.val[2];
    b[3].val[1] = line23.val[3];

    b[4].val[0] = line45.val[0];
    b[4].val[1] = line45.val[1];
    b[5].val[0] = line45.val[2];
    b[5].val[1] = line45.val[3];
    b[6].val[0] = line67.val[0];
    b[6].val[1] = line67.val[1];
    b[7].val[0] = line67.val[2];
    b[7].val[1] = line67.val[3];

    pA = (float32x4x2_t *) pMatA;
    i = 8;
    do
    {
        // just the right amount of data for aarch32 NEON register bank size
        pB = b;
        temp = *pA++;
        result.val[0] = dotProduct(*pB++, temp);
        result.val[1] = dotProduct(*pB++, temp);
        result.val[2] = dotProduct(*pB++, temp);
        result.val[3] = dotProduct(*pB++, temp);
        vst4_lane_f32(pDst++, result, 0);

        result.val[0] = dotProduct(*pB++, temp);
        result.val[1] = dotProduct(*pB++, temp);
        result.val[2] = dotProduct(*pB++, temp);
        result.val[3] = dotProduct(*pB, temp);
        vst4_lane_f32(pDst++, result, 0);
    } while (--i);
}

///////////////////////////编辑

我检查了反汇编,生成的代码是FUBAR。 (Linaro GCC 7.1.1)

我会选择走汇编路线。在我看来,用内在函数编写NEON代码纯粹是浪费时间。