使用SIMD内在函数进行高效的行列转换

时间:2014-02-11 19:52:08

标签: matrix x86 sse simd avx

我是SIMD编程的初学者。我想按如下方式处理我的数据:

假设我有4个SIMD变量(__m128i),其中的数据如下:

__m128i a = {a1, a2, a3, a4}
__m128i b = {b1, b2, b3, b4}
__m128i c = {c1, c2, c3, c4}
__m128i d = {d1, d2, d3, d4}

现在我想按如下方式初始化Xi:

__m128i x1 = {a1, b1, c1, d1}
__m128i x2 = {a2, b2, c2, d2}
__m128i x3 = {a3, b3, c3, d3}
__m128i x4 = {a4, b4, c4, d4}

有人可以告诉我如何才能有效地做到这一点?

1 个答案:

答案 0 :(得分:3)

我假设您的问题中存在笔误,您实际想做的是4x4转置。如果是这样,可以用8条指令完成4x4转置(4条32位unpack指令加4条64位unpack指令),如下所示:

#include "emmintrin.h"

// Transposes, in place, a 4x4 matrix of 32-bit integers whose rows are held
// in the four SSE registers v0..v3 (row i goes in, column i comes out).
//
// Two interleave stages, four unpack instructions each:
//   1. 32-bit interleave of row pairs (v0,v1) and (v2,v3)
//   2. 64-bit interleave of the stage-1 results
inline void Transpose_4_4(
    __m128i &v0,               // in: a1 a2 a3 a4   out: a1 b1 c1 d1
    __m128i &v1,               // in: b1 b2 b3 b4   out: a2 b2 c2 d2
    __m128i &v2,               // in: c1 c2 c3 c4   out: a3 b3 c3 d3
    __m128i &v3)               // in: d1 d2 d3 d4   out: a4 b4 c4 d4
{
    // Stage 1: all inputs are read before any output is written.
    const __m128i ab_lo = _mm_unpacklo_epi32(v0, v1);   // a1 b1 a2 b2
    const __m128i cd_lo = _mm_unpacklo_epi32(v2, v3);   // c1 d1 c2 d2
    const __m128i ab_hi = _mm_unpackhi_epi32(v0, v1);   // a3 b3 a4 b4
    const __m128i cd_hi = _mm_unpackhi_epi32(v2, v3);   // c3 d3 c4 d4

    // Stage 2: stitch the 64-bit halves together into the final columns.
    v0 = _mm_unpacklo_epi64(ab_lo, cd_lo);              // a1 b1 c1 d1
    v1 = _mm_unpackhi_epi64(ab_lo, cd_lo);              // a2 b2 c2 d2
    v2 = _mm_unpacklo_epi64(ab_hi, cd_hi);              // a3 b3 c3 d3
    v3 = _mm_unpackhi_epi64(ab_hi, cd_hi);              // a4 b4 c4 d4
}

演示:

//
// transpose_4_4.cpp
//

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

// Transposes, in place, a 4x4 matrix of 32-bit integers whose rows are held
// in the four SSE registers v0..v3 (row i goes in, column i comes out).
//
// Two interleave stages, four unpack instructions each:
//   1. 32-bit interleave of row pairs (v0,v1) and (v2,v3)
//   2. 64-bit interleave of the stage-1 results
inline void Transpose_4_4(
    __m128i &v0,               // in: a1 a2 a3 a4   out: a1 b1 c1 d1
    __m128i &v1,               // in: b1 b2 b3 b4   out: a2 b2 c2 d2
    __m128i &v2,               // in: c1 c2 c3 c4   out: a3 b3 c3 d3
    __m128i &v3)               // in: d1 d2 d3 d4   out: a4 b4 c4 d4
{
    // Stage 1: all inputs are read before any output is written.
    const __m128i ab_lo = _mm_unpacklo_epi32(v0, v1);   // a1 b1 a2 b2
    const __m128i cd_lo = _mm_unpacklo_epi32(v2, v3);   // c1 d1 c2 d2
    const __m128i ab_hi = _mm_unpackhi_epi32(v0, v1);   // a3 b3 a4 b4
    const __m128i cd_hi = _mm_unpackhi_epi32(v2, v3);   // c3 d3 c4 d4

    // Stage 2: stitch the 64-bit halves together into the final columns.
    v0 = _mm_unpacklo_epi64(ab_lo, cd_lo);              // a1 b1 c1 d1
    v1 = _mm_unpackhi_epi64(ab_lo, cd_lo);              // a2 b2 c2 d2
    v2 = _mm_unpacklo_epi64(ab_hi, cd_hi);              // a3 b3 c3 d3
    v3 = _mm_unpackhi_epi64(ab_hi, cd_hi);              // a4 b4 c4 d4
}

// Side length of the square demo matrix; one row fills one 128-bit register.
enum { MATRIX_DIM = 4 };

// Prints a blank line, "<label>:", then the matrix one row per line with each
// element right-aligned in a 4-character field.
static void print_matrix(const char *label, int32_t (*m)[MATRIX_DIM])
{
    printf("\n%s:\n", label);
    for (int i = 0; i < MATRIX_DIM; ++i)
    {
        for (int j = 0; j < MATRIX_DIM; ++j)
        {
            printf("%4d", m[i][j]);
        }
        printf("\n");
    }
}

// Demo driver: fills a 4x4 matrix with 0..15 in row-major order, prints it,
// transposes it with Transpose_4_4, and prints the result.
int main(void)
{
    // 16-byte alignment is required by the aligned load/store intrinsics below.
    int32_t buff[MATRIX_DIM][MATRIX_DIM] __attribute__ ((aligned(16)));

    // init buff with its row-major element index
    for (int i = 0; i < MATRIX_DIM; ++i)
    {
        for (int j = 0; j < MATRIX_DIM; ++j)
        {
            buff[i][j] = i * MATRIX_DIM + j;
        }
    }

    print_matrix("BEFORE", buff);

    // Load each row into a register, transpose, and store the rows back.
    // Explicit load/store intrinsics are used instead of dereferencing a
    // cast int32_t pointer as an __m128i lvalue.
    __m128i row[MATRIX_DIM];
    for (int i = 0; i < MATRIX_DIM; ++i)
    {
        row[i] = _mm_load_si128((const __m128i *)buff[i]);
    }

    Transpose_4_4(row[0], row[1], row[2], row[3]);

    for (int i = 0; i < MATRIX_DIM; ++i)
    {
        _mm_store_si128((__m128i *)buff[i], row[i]);
    }

    print_matrix("AFTER", buff);

    return 0;
}

编译并运行:

$ g++ -Wall -msse3 transpose_4_4.cpp && ./a.out 

BEFORE:
   0   1   2   3
   4   5   6   7
   8   9  10  11
  12  13  14  15

AFTER:
   0   4   8  12
   1   5   9  13
   2   6  10  14
   3   7  11  15
$