英特尔已经提供了_MM_TRANSPOSE4_PS宏,用来转置由4个向量组成的4x4矩阵。我想用__m256d做同样的事情。但是,我似乎无法弄清楚如何让_mm256_shuffle_pd以同样的方式工作。
_MM_TRANSPOSE4_PS代码
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) { \
__m128 tmp3, tmp2, tmp1, tmp0; \
\
tmp0 = _mm_shuffle_ps((row0), (row1), 0x44); \
tmp2 = _mm_shuffle_ps((row0), (row1), 0xEE); \
tmp1 = _mm_shuffle_ps((row2), (row3), 0x44); \
tmp3 = _mm_shuffle_ps((row2), (row3), 0xEE); \
\
(row0) = _mm_shuffle_ps(tmp0, tmp1, 0x88); \
(row1) = _mm_shuffle_ps(tmp0, tmp1, 0xDD); \
(row2) = _mm_shuffle_ps(tmp2, tmp3, 0x88); \
(row3) = _mm_shuffle_ps(tmp2, tmp3, 0xDD); \
}
下面是我在需要用到它的循环中对_MM_TRANSPOSE4_PD的尝试:
// Walks m2data in groups of four rows starting at offset i, transposes each
// 4x4 tile of doubles, and appends the four transposed rows to buffer.
// NOTE(review): each iteration loads four rows unconditionally; this assumes
// the number of remaining rows is a multiple of 4 -- confirm against caller.
for (int copy = i; copy < m2.size();)
{
    // Load four consecutive rows of the tile; rows are m2.col() doubles apart.
    // _mm256_load_pd requires 32-byte aligned addresses -- presumably m2data
    // and the row stride guarantee that; TODO confirm.
    __m256d row0 = _mm256_load_pd(m2data + copy);
    copy += m2.col();
    __m256d row1 = _mm256_load_pd(m2data + copy);
    copy += m2.col();
    __m256d row2 = _mm256_load_pd(m2data + copy);
    copy += m2.col();
    __m256d row3 = _mm256_load_pd(m2data + copy);
    copy += m2.col();

    // Stage 1: interleave within each 128-bit lane. _mm256_shuffle_pd only
    // reads the low 4 bits of its immediate and never moves data across
    // lanes, so the _mm_shuffle_ps constants (0x44/0xEE/0x88/0xDD) do not
    // carry over. 0x0 takes the even element of every pair, 0xF the odd one.
    const __m256d tmp0 = _mm256_shuffle_pd(row0, row1, 0x0); // r0[0] r1[0] r0[2] r1[2]
    const __m256d tmp2 = _mm256_shuffle_pd(row0, row1, 0xF); // r0[1] r1[1] r0[3] r1[3]
    const __m256d tmp1 = _mm256_shuffle_pd(row2, row3, 0x0); // r2[0] r3[0] r2[2] r3[2]
    const __m256d tmp3 = _mm256_shuffle_pd(row2, row3, 0xF); // r2[1] r3[1] r2[3] r3[3]

    // Stage 2: the cross-lane step. 0x20 joins the low 128-bit halves of
    // both sources, 0x31 joins the high halves.
    row0 = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); // column 0
    row1 = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); // column 1
    row2 = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); // column 2
    row3 = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); // column 3

    // Append the transposed rows; counter advances one buffer element per
    // store (presumably buffer is an array of __m256d -- verify).
    _mm256_store_pd(reinterpret_cast<double*>(buffer + counter++), row0);
    _mm256_store_pd(reinterpret_cast<double*>(buffer + counter++), row1);
    _mm256_store_pd(reinterpret_cast<double*>(buffer + counter++), row2);
    _mm256_store_pd(reinterpret_cast<double*>(buffer + counter++), row3);
}
答案 0 :(得分:5)
下面是我找到的解决方案的等效宏。
/*
 * In-place transpose of a 4x4 matrix of doubles held in four __m256d rows
 * (AVX; intrinsics come from <immintrin.h>).
 *
 * row0..row3 are both inputs and outputs: on exit, rowN holds element N of
 * each of the four original rows (i.e. original column N).
 *
 * Each argument is expanded more than once (read in the first stage, assigned
 * in the second), so pass plain lvalues without side effects. The temporaries
 * tmp0..tmp3 live in the macro's own brace scope; arguments must not use
 * those names.
 */
#define _MM_TRANSPOSE4_PD(row0,row1,row2,row3) \
{ \
    /* __m256d is the type the AVX intrinsics below return. */ \
    __m256d tmp3, tmp2, tmp1, tmp0; \
    \
    /* Stage 1: interleave within each 128-bit lane.                    */ \
    /* 0x0 takes the even element of every pair, 0xF the odd element:   */ \
    /*   tmp0 = r0[0] r1[0] r0[2] r1[2],  tmp2 = r0[1] r1[1] r0[3] r1[3] */ \
    tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); \
    tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); \
    tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); \
    tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); \
    \
    /* Stage 2: cross-lane step. 0x20 joins the low 128-bit halves of   */ \
    /* both sources, 0x31 joins the high halves.                        */ \
    (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); \
    (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); \
    (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); \
    (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); \
}