我想在 C++ 中使用 SIMD 指令比较 uint32_t 数组中的值,并把满足条件的元素的下标存入一个新数组。代码基本可以正常工作,但在 SIMD 比较之后,我仍然需要用 4 个 if 语句逐个判断每个通道是否要写出结果。
有没有办法用 SIMD 指令完成这一步?函数 allocateAlignedBuffer 的功能正如其名(分配对齐的缓冲区),并且工作正常。
// Collects the indices of all elements of `arr` whose value is >= 10.
//
// arr: input values; read with unaligned loads, so no alignment is required.
// num: number of elements in `arr` (any value, not only multiples of 4).
// cnt: out-parameter; receives the number of indices written.
//
// Returns the buffer obtained from allocateAlignedBuffer<uint32_t>(num, true);
// its first *cnt entries are the matching indices in ascending order.
// Returns nullptr (with *cnt == 0) if the allocation fails for a non-empty input.
// NOTE(review): the meaning of the `true` flag and the matching deallocation
// routine are defined by allocateAlignedBuffer, which is not visible here.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t* cnt) {
    uint32_t* resArr = allocateAlignedBuffer<uint32_t>(num, true);
    uint32_t* resPos = resArr;
    *cnt = 0;
    if (resArr == nullptr && num != 0) {
        return nullptr;
    }

    // SSE2 only offers signed 32-bit compares. Flipping the sign bit of both
    // operands turns the signed compare into an unsigned one, so values at or
    // above 2^31 are classified correctly. "x >= 10" is evaluated as "x > 9".
    const __m128i signBit = _mm_set1_epi32(INT32_MIN);
    const __m128i biasedLimit = _mm_xor_si128(_mm_set1_epi32(10 - 1), signBit);

    // Diagnostic text emitted for each matching lane of a partially matching vector.
    static const char* const laneMessages[4] = {
        "first byte not 0\n",
        "second byte not 0\n",
        "3rd byte not 0\n",
        "4th byte not 0\n"
    };

    // Only whole groups of four go through the vector path.
    const uint32_t vecEnd = num & ~uint32_t{3};
    for (uint32_t i = 0; i < vecEnd; i += 4) {
        const __m128i vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hits = _mm_cmpgt_epi32(_mm_xor_si128(vec, signBit), biasedLimit);
        // One bit per 32-bit lane: bit k is set when arr[i + k] >= 10.
        const int mask = _mm_movemask_ps(_mm_castsi128_ps(hits));

        if (mask == 0) {
            continue;  // nothing found in this group
        }
        if (mask == 0xF) {
            // All four lanes match: emit the four indices without diagnostics.
            resPos[0] = i;
            resPos[1] = i + 1;
            resPos[2] = i + 2;
            resPos[3] = i + 3;
            resPos += 4;
            continue;
        }
        for (uint32_t lane = 0; lane < 4; ++lane) {
            if (mask & (1 << lane)) {
                std::cout << laneMessages[lane];
                *resPos++ = i + lane;
            }
        }
    }

    // Scalar tail for the last (num % 4) elements, which a 16-byte load would overrun.
    for (uint32_t i = vecEnd; i < num; ++i) {
        if (arr[i] >= 10) {
            *resPos++ = i;
        }
    }

    // The write cursor is the single source of truth for the match count.
    *cnt = static_cast<uint32_t>(resPos - resArr);
    std::cout << "cnt " << *cnt << "\n";
    return resArr;
}
我相信这段代码还有很多可以优化的地方。
答案 0 :(得分:1)
使用shuffle的另一种变体:
// pshufb control masks that left-pack the selected 32-bit lanes of a vector.
// The table index is the 4-bit lane mask produced by _mm_movemask_ps (bit k set
// means lane k is kept). Unused trailing lanes replicate byte 0; their content
// is never consumed because the output cursor only advances by g_steps[index].
__m128i g_shuffles[16] =
{
_mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0),
_mm_setr_epi8(12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0),
_mm_setr_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
};

// Number of set bits in each 4-bit lane mask, i.e. how many indices the
// corresponding g_shuffles entry packs to the front of the vector.
uint32_t g_steps[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };

// Collects the indices of all elements of `arr` whose value is >= 10, using a
// pshufb-based left-pack so the vector loop contains no data-dependent branch.
//
// arr: input values; read with unaligned loads.
// num: number of elements in `arr` (any value, not only multiples of 4).
// cnt: out-parameter; receives the number of indices written.
//
// Returns a 16-byte-aligned buffer of `num` elements allocated with _mm_malloc;
// the caller releases it with _mm_free. Only the first *cnt entries are
// meaningful (ascending indices); the remainder is unspecified.
// Returns nullptr (with *cnt == 0) if the allocation fails for a non-empty input.
// Requires a CPU with SSSE3 because of _mm_shuffle_epi8.
#if defined(__GNUC__)
// Lets GCC/Clang compile the SSSE3 intrinsic without a global -mssse3 flag.
__attribute__((target("ssse3")))
#endif
uint32_t * testFunc2(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
    uint32_t* resArr = static_cast<uint32_t*>(_mm_malloc(num * sizeof(uint32_t), 16));
    uint32_t* resPos = resArr;
    *cnt = 0;
    if (resArr == nullptr && num != 0) {
        return nullptr;
    }

    // SSE has only signed 32-bit compares. Flipping the sign bit of both
    // operands yields an unsigned compare, so values >= 2^31 are not mistaken
    // for negative numbers. "x >= 10" is evaluated as "x > 9".
    const __m128i signBit = _mm_set1_epi32(INT32_MIN);
    const __m128i biasedLimit = _mm_xor_si128(_mm_set1_epi32(10 - 1), signBit);
    const __m128i four = _mm_set1_epi32(4);
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);

    // Only whole groups of four go through the vector path.
    const uint32_t vecEnd = num & ~uint32_t{3};
    for (uint32_t i = 0; i < vecEnd; i += 4) {
        const __m128i values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hits = _mm_cmpgt_epi32(_mm_xor_si128(values, signBit), biasedLimit);
        const int index = _mm_movemask_ps(_mm_castsi128_ps(hits));

        // Always store a full vector; only g_steps[index] entries are kept.
        // The store stays in bounds because resPos never runs ahead of i and
        // i + 4 <= num inside this loop.
        const __m128i packed = _mm_shuffle_epi8(positions, g_shuffles[index]);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos), packed);
        resPos += g_steps[index];

        positions = _mm_add_epi32(positions, four);
    }

    // Scalar tail for the last (num % 4) elements, which a 16-byte load would overrun.
    for (uint32_t i = vecEnd; i < num; ++i) {
        if (arr[i] >= 10) {
            *resPos++ = i;
        }
    }

    // The write cursor already equals the number of matches.
    *cnt = static_cast<uint32_t>(resPos - resArr);
    std::cout << "cnt " << *cnt << "\n";
    return resArr;
}
答案 1 :(得分:0)
我做了一些修改,应该能提高性能:
#include <immintrin.h>
#include <memory.h>
// Collects the indices of all elements of `arr` whose value is >= 10.
//
// arr: input values; read with unaligned loads.
// num: number of elements in `arr` (any value, not only multiples of 4).
// cnt: out-parameter; receives the number of indices written.
//
// Returns a 16-byte-aligned buffer of `num` elements allocated with _mm_malloc;
// the caller releases it with _mm_free. Only the first *cnt entries are
// meaningful (ascending indices); the remainder is unspecified.
// Returns nullptr (with *cnt == 0) if the allocation fails for a non-empty input.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
    uint32_t* resArr = static_cast<uint32_t*>(_mm_malloc(num * sizeof(uint32_t), 16));
    uint32_t* resPos = resArr;
    *cnt = 0;
    if (resArr == nullptr && num != 0) {
        return nullptr;
    }

    // SSE2 has only signed 32-bit compares. Flipping the sign bit of both
    // operands yields an unsigned compare, so values >= 2^31 are handled
    // correctly. "x >= 10" is evaluated as "x > 9" (the previous
    // cmplt(10, x) form tested "x > 10" and dropped elements equal to 10).
    const __m128i signBit = _mm_set1_epi32(INT32_MIN);
    const __m128i biasedLimit = _mm_xor_si128(_mm_set1_epi32(10 - 1), signBit);
    const __m128i four = _mm_set1_epi32(4);
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);

    // Diagnostic text emitted for each matching lane of a partially matching vector.
    static const char* const laneMessages[4] = {
        "first byte not 0\n",
        "second byte not 0\n",
        "3rd byte not 0\n",
        "4th byte not 0\n"
    };

    // Only whole groups of four go through the vector path.
    const uint32_t vecEnd = num & ~uint32_t{3};
    for (uint32_t i = 0; i < vecEnd; i += 4) {
        const __m128i vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hits = _mm_cmpgt_epi32(_mm_xor_si128(vec, signBit), biasedLimit);
        // One bit per 32-bit lane: bit k is set when arr[i + k] >= 10.
        const int mask = _mm_movemask_ps(_mm_castsi128_ps(hits));

        if (mask == 0xF) {
            // All four lanes match: store the whole index vector at once.
            // In bounds because resPos never runs ahead of i and i + 4 <= num.
            _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos), positions);
            resPos += 4;
        } else {
            for (uint32_t lane = 0; lane < 4; ++lane) {
                if (mask & (1 << lane)) {
                    std::cout << laneMessages[lane];
                    *resPos++ = i + lane;
                }
            }
        }
        positions = _mm_add_epi32(positions, four);
    }

    // Scalar tail for the last (num % 4) elements, which a 16-byte load would overrun.
    for (uint32_t i = vecEnd; i < num; ++i) {
        if (arr[i] >= 10) {
            *resPos++ = i;
        }
    }

    // The write cursor already equals the number of matches.
    *cnt = static_cast<uint32_t>(resPos - resArr);
    std::cout << "cnt " << *cnt << "\n";
    return resArr;
}
当然,如果循环中的所有标量指令都改为向量指令,那将是很好的。
答案 2 :(得分:0)
这是一个利用 pshufb 技巧进行压缩(compaction)的版本,但未经测试;另外,shuffle 掩码表不应该定义为局部变量。
// Collects the indices of all elements of `arr` whose value is >= 10, using a
// pshufb-based left-pack so the vector loop contains no data-dependent branch.
//
// arr: input values; read with unaligned loads.
// num: number of elements in `arr` (any value, not only multiples of 4).
// cnt: out-parameter; receives the number of indices written.
//
// Returns a 16-byte-aligned buffer of `num` elements allocated with _mm_malloc;
// the caller releases it with _mm_free. Only the first *cnt entries are
// meaningful (ascending indices); the remainder is unspecified.
// Returns nullptr (with *cnt == 0) if the allocation fails for a non-empty input.
// Requires a CPU with SSSE3 because of _mm_shuffle_epi8.
#if defined(__GNUC__)
// Lets GCC/Clang compile the SSSE3 intrinsic without a global -mssse3 flag.
__attribute__((target("ssse3")))
#endif
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
    // Byte selectors for the four 32-bit lanes, plus the "write zero" selector
    // (high bit set in every byte makes pshufb produce 0).
    static const uint32_t X  = 0x80808080u;
    static const uint32_t L0 = 0x03020100u;
    static const uint32_t L1 = 0x07060504u;
    static const uint32_t L2 = 0x0B0A0908u;
    static const uint32_t L3 = 0x0F0E0D0Cu;

    // pshufb control masks indexed by the 4-bit lane mask from movmskps
    // (bit k set means lane k is kept). Each row lists output lanes 0..3, so the
    // kept lanes are packed to the front. Constant data, built once instead of
    // on every call.
    static const uint32_t compactionMasks[16][4] = {
        { X,  X,  X,  X  },
        { L0, X,  X,  X  },
        { L1, X,  X,  X  },
        { L0, L1, X,  X  },
        { L2, X,  X,  X  },
        { L0, L2, X,  X  },
        { L1, L2, X,  X  },
        { L0, L1, L2, X  },
        { L3, X,  X,  X  },
        { L0, L3, X,  X  },
        { L1, L3, X,  X  },
        { L0, L1, L3, X  },
        { L2, L3, X,  X  },
        { L0, L2, L3, X  },
        { L1, L2, L3, X  },
        { L0, L1, L2, L3 }
    };
    // Number of kept lanes (set bits) for each 4-bit lane mask.
    static const uint8_t keptLanes[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };

    uint32_t* resArr = static_cast<uint32_t*>(_mm_malloc(num * sizeof(uint32_t), 16));
    uint32_t* resPos = resArr;
    *cnt = 0;
    if (resArr == nullptr && num != 0) {
        return nullptr;
    }

    // SSE has only signed 32-bit compares. Flipping the sign bit of both
    // operands yields an unsigned compare, so values >= 2^31 are handled
    // correctly. "x >= 10" is evaluated as "x > 9" (the previous
    // cmplt(10, x) form tested "x > 10" and dropped elements equal to 10).
    const __m128i signBit = _mm_set1_epi32(INT32_MIN);
    const __m128i biasedLimit = _mm_xor_si128(_mm_set1_epi32(10 - 1), signBit);
    const __m128i four = _mm_set1_epi32(4);
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);

    // Only whole groups of four go through the vector path.
    const uint32_t vecEnd = num & ~uint32_t{3};
    for (uint32_t i = 0; i < vecEnd; i += 4) {
        const __m128i values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hits = _mm_cmpgt_epi32(_mm_xor_si128(values, signBit), biasedLimit);
        const int laneMask = _mm_movemask_ps(_mm_castsi128_ps(hits));

        const __m128i control =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(compactionMasks[laneMask]));
        // Always store a full vector; only keptLanes[laneMask] entries are kept.
        // In bounds because resPos never runs ahead of i and i + 4 <= num.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos),
                         _mm_shuffle_epi8(positions, control));
        resPos += keptLanes[laneMask];

        positions = _mm_add_epi32(positions, four);
    }

    // Scalar tail for the last (num % 4) elements, which a 16-byte load would overrun.
    for (uint32_t i = vecEnd; i < num; ++i) {
        if (arr[i] >= 10) {
            *resPos++ = i;
        }
    }

    // The write cursor already equals the number of matches.
    *cnt = static_cast<uint32_t>(resPos - resArr);
    return resArr;
}
这里的思路是:每种情况都可以通过一次 shuffle 把元素排列到位;一共有 16 种情况,根据情况编号加载对应的 shuffle 掩码即可区分,而情况编号由 movmskps 给出。使用 AVX2 时,可以用 vpermd 实现类似的操作。