我是SIMD编程的初学者。我想按如下方式处理我的数据:
假设我有 4 个 SIMD 变量(类型为 __m128i),数据如下:
__m128i a = {a1, a2, a3, a4}
__m128i b = {b1, b2, b3, b4}
__m128i c = {c1, c2, c3, c4}
__m128i d = {d1, d2, d3, d4}
现在我想按如下方式初始化Xi:
__m128i x1 = {a1, b1, c1, d1}
__m128i x2 = {a2, b2, c2, d2}
__m128i x3 = {a3, b3, c3, d3}
__m128i x4 = {a4, b4, c4, d4}
有人可以告诉我如何才能有效地做到这一点?
答案 0 :(得分:3)
我假设你的问题中有笔误,你实际想做的是 4x4 转置。如果是这样,可以用 8 条 SSE2 指令(4 条 32 位 unpack 加 4 条 64 位 unpack)完成 4x4 转置,如下所示:
#include "emmintrin.h"
// Transposes, in place, the 4x4 matrix of 32-bit lanes whose rows are held in
// v0..v3. On return v0..v3 hold what were the columns of the input.
inline void Transpose_4_4(
    __m128i &v0,  // in: a1, a2, a3, a4   out: a1, b1, c1, d1
    __m128i &v1,  // in: b1, b2, b3, b4   out: a2, b2, c2, d2
    __m128i &v2,  // in: c1, c2, c3, c4   out: a3, b3, c3, d3
    __m128i &v3)  // in: d1, d2, d3, d4   out: a4, b4, c4, d4
{
    // Stage 1: interleave the 32-bit lanes of each pair of rows.
    const __m128i ab_lo = _mm_unpacklo_epi32(v0, v1);  // a1, b1, a2, b2
    const __m128i cd_lo = _mm_unpacklo_epi32(v2, v3);  // c1, d1, c2, d2
    const __m128i ab_hi = _mm_unpackhi_epi32(v0, v1);  // a3, b3, a4, b4
    const __m128i cd_hi = _mm_unpackhi_epi32(v2, v3);  // c3, d3, c4, d4

    // Stage 2: interleave the 64-bit halves so that every output register
    // collects one lane pair from the a/b rows and one from the c/d rows.
    v0 = _mm_unpacklo_epi64(ab_lo, cd_lo);  // a1, b1, c1, d1
    v1 = _mm_unpackhi_epi64(ab_lo, cd_lo);  // a2, b2, c2, d2
    v2 = _mm_unpacklo_epi64(ab_hi, cd_hi);  // a3, b3, c3, d3
    v3 = _mm_unpackhi_epi64(ab_hi, cd_hi);  // a4, b4, c4, d4
}
演示:
//
// transpose_4_4.cpp
//
#include <stdio.h>
#include <emmintrin.h>
// Transposes, in place, the 4x4 matrix of 32-bit lanes whose rows are held in
// v0..v3. On return v0..v3 hold what were the columns of the input.
inline void Transpose_4_4(
    __m128i &v0,  // in: a1, a2, a3, a4   out: a1, b1, c1, d1
    __m128i &v1,  // in: b1, b2, b3, b4   out: a2, b2, c2, d2
    __m128i &v2,  // in: c1, c2, c3, c4   out: a3, b3, c3, d3
    __m128i &v3)  // in: d1, d2, d3, d4   out: a4, b4, c4, d4
{
    // Stage 1: interleave the 32-bit lanes of each pair of rows.
    const __m128i ab_lo = _mm_unpacklo_epi32(v0, v1);  // a1, b1, a2, b2
    const __m128i cd_lo = _mm_unpacklo_epi32(v2, v3);  // c1, d1, c2, d2
    const __m128i ab_hi = _mm_unpackhi_epi32(v0, v1);  // a3, b3, a4, b4
    const __m128i cd_hi = _mm_unpackhi_epi32(v2, v3);  // c3, d3, c4, d4

    // Stage 2: interleave the 64-bit halves so that every output register
    // collects one lane pair from the a/b rows and one from the c/d rows.
    v0 = _mm_unpacklo_epi64(ab_lo, cd_lo);  // a1, b1, c1, d1
    v1 = _mm_unpackhi_epi64(ab_lo, cd_lo);  // a2, b2, c2, d2
    v2 = _mm_unpacklo_epi64(ab_hi, cd_hi);  // a3, b3, c3, d3
    v3 = _mm_unpackhi_epi64(ab_hi, cd_hi);  // a4, b4, c4, d4
}
// Demo driver: fills a 4x4 matrix with 0..15 in row-major order, prints it,
// transposes it in place with Transpose_4_4, and prints the result.
int main(void)
{
    // 16-byte alignment lets each row be accessed directly as an __m128i.
    int32_t buff[4][4] __attribute__ ((aligned(16)));

    // Fill with consecutive values: element (row, col) gets row * 4 + col.
    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 4; ++col)
        {
            buff[row][col] = row * 4 + col;
        }
    }

    // Show the matrix before the transpose.
    printf("\nBEFORE:\n");
    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 4; ++col)
        {
            printf("%4d", buff[row][col]);
        }
        printf("\n");
    }

    // View each row of buff as one SSE register and transpose in place.
    __m128i &row0 = *(__m128i *)buff[0];
    __m128i &row1 = *(__m128i *)buff[1];
    __m128i &row2 = *(__m128i *)buff[2];
    __m128i &row3 = *(__m128i *)buff[3];
    Transpose_4_4(row0, row1, row2, row3);

    // Show the matrix after the transpose.
    printf("\nAFTER:\n");
    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 4; ++col)
        {
            printf("%4d", buff[row][col]);
        }
        printf("\n");
    }

    return 0;
}
编译并运行:
$ g++ -Wall -msse3 transpose_4_4.cpp && ./a.out
BEFORE:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
AFTER:
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15
$