在 C++ 中使用 SIMD 指令比较数值并存储结果

时间:2017-03-22 11:05:04

标签: c++11 simd

我想在 C++ 中使用 SIMD 指令比较 uint32_t 数组中的值，并把满足条件的元素的位置存入一个新数组。代码基本可以正常工作，但在 SIMD 比较之后，我仍然需要用 4 个 if 子句逐个判断每个元素的比较结果，以决定是否写回。

有没有办法也用 SIMD 指令完成这一步？函数 allocateAlignedBuffer 的作用正如其名（分配对齐的缓冲区），并且工作正常。

/**
 * Collects the indices of all elements of `arr` whose value is >= 10.
 *
 * @param arr  Input values; `num` elements are read. No alignment is required.
 * @param num  Number of elements in `arr` (need not be a multiple of 4).
 * @param cnt  Out-parameter; receives the number of indices written.
 * @return     Buffer obtained from allocateAlignedBuffer<uint32_t>(num, true)
 *             whose first `*cnt` entries are the matching indices in
 *             ascending order. Ownership passes to the caller.
 *             NOTE(review): release it with whatever counterpart
 *             allocateAlignedBuffer expects - not visible from here.
 *
 * Side effects: writes debug traces and the final count to std::cout.
 */
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t* cnt) {
    constexpr uint32_t kThreshold = 10;
    static const char* const kLaneTrace[4] = {
        "first byte not 0\n",
        "second byte not 0\n",
        "3rd byte not 0\n",
        "4th byte not 0\n",
    };

    uint32_t* resArr = allocateAlignedBuffer<uint32_t>(num, true);
    uint32_t* resPos = resArr;

    *cnt = 0;

    // SSE2 only offers signed 32-bit compares. Flipping the sign bit of both
    // operands maps unsigned ordering onto signed ordering, so the test below
    // is a true unsigned "value > kThreshold - 1", i.e. "value >= kThreshold".
    const __m128i signBit = _mm_set1_epi32(INT32_MIN);
    const __m128i limit =
        _mm_xor_si128(_mm_set1_epi32(static_cast<int>(kThreshold - 1)), signBit);

    // Only whole groups of four go through the vector path; the remainder is
    // handled element-wise below so nothing is read past arr[num - 1].
    const uint32_t vecEnd = num - (num % 4);
    for (uint32_t i = 0; i < vecEnd; i += 4) {
        const __m128i vec =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hit = _mm_cmpgt_epi32(_mm_xor_si128(vec, signBit), limit);

        // One bit per 32-bit lane: bit k is set when arr[i + k] matched.
        const int lanes = _mm_movemask_ps(_mm_castsi128_ps(hit));

        if (lanes == 0) {
            continue;  // nothing found in this group
        }

        if (lanes == 0xF) {
            // All four elements matched: emit the four indices in one go.
            for (uint32_t k = 0; k < 4; ++k) {
                resPos[k] = i + k;
            }
            resPos += 4;
            *cnt += 4;
            continue;
        }

        for (uint32_t k = 0; k < 4; ++k) {
            if ((lanes >> k) & 1) {
                std::cout << kLaneTrace[k];
                *resPos++ = i + k;
                ++(*cnt);  // increment the counter itself, not the pointer
            }
        }
    }

    // Scalar tail for the last num % 4 elements.
    for (uint32_t i = vecEnd; i < num; ++i) {
        if (arr[i] >= kThreshold) {
            std::cout << kLaneTrace[i - vecEnd];
            *resPos++ = i;
            ++(*cnt);
        }
    }

    std::cout << "cnt " << *cnt << "\n";
    return resArr;
}

我相信这段代码还有很多可以优化的地方。

3 个答案:

答案 0 :(得分:1)

使用shuffle的另一种变体:

// Byte-shuffle control vectors for _mm_shuffle_epi8, indexed by the 4-bit lane
// mask produced by _mm_movemask_ps (bit k set <=> 32-bit lane k matched).
// Entry `m` moves the selected lanes, in ascending lane order, to the front of
// the vector. The remaining control bytes are 0 (they replicate source byte 0);
// testFunc2 only advances its output cursor by g_steps[m] elements, so those
// trailing lanes are not counted as results.
__m128i g_shuffles[16] = 
{
    _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), // 0b0000: no lane
    _mm_setr_epi8(0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), // 0b0001: lane 0
    _mm_setr_epi8(4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), // 0b0010: lane 1
    _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0), // 0b0011: lanes 0,1
    _mm_setr_epi8(8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), // 0b0100: lane 2
    _mm_setr_epi8(0, 1, 2, 3,  8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0), // 0b0101: lanes 0,2
    _mm_setr_epi8(4, 5, 6, 7,  8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0), // 0b0110: lanes 1,2
    _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0), // 0b0111: lanes 0,1,2
    _mm_setr_epi8(12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), // 0b1000: lane 3
    _mm_setr_epi8(0, 1, 2, 3, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0), // 0b1001: lanes 0,3
    _mm_setr_epi8(4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0), // 0b1010: lanes 1,3
    _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0), // 0b1011: lanes 0,1,3
    _mm_setr_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0), // 0b1100: lanes 2,3
    _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0), // 0b1101: lanes 0,2,3
    _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0), // 0b1110: lanes 1,2,3
    _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) // 0b1111: all lanes
};
// Number of set bits in each 4-bit mask, i.e. how many results entry `m` of
// g_shuffles produces and how far the output cursor must advance.
uint32_t g_steps[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };

/**
 * Collects the indices of all elements of `arr` whose value is >= 10, using a
 * pshufb-based left-pack driven by the g_shuffles / g_steps lookup tables.
 *
 * @param arr  Input values; `num` elements are read. No alignment is required.
 * @param num  Number of elements in `arr` (need not be a multiple of 4).
 * @param cnt  Out-parameter; receives the number of indices written.
 * @return     16-byte aligned buffer of `num` elements from _mm_malloc whose
 *             first `*cnt` entries are the matching indices in ascending
 *             order; entries beyond `*cnt` are unspecified. The caller must
 *             release it with _mm_free. If the allocation fails, nullptr is
 *             returned and `*cnt` is 0.
 *
 * Requires a CPU with SSSE3 (_mm_shuffle_epi8). Side effect: prints the final
 * count to std::cout.
 */
#if defined(__GNUC__) || defined(__clang__)
// Enable SSSE3 for this function only, so the file builds without -mssse3.
__attribute__((target("ssse3")))
#endif
uint32_t * testFunc2(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
    constexpr uint32_t kThreshold = 10;

    *cnt = 0;

    uint32_t* const resArr =
        static_cast<uint32_t*>(_mm_malloc(num * sizeof(uint32_t), 16));
    // Without an output buffer there is nothing we can safely process.
    const uint32_t usable = (resArr != nullptr) ? num : 0;
    uint32_t* resPos = resArr;

    // Signed compare on sign-flipped operands == unsigned compare, so values
    // >= 2^31 are classified correctly. "v > kThreshold - 1" <=> "v >= kThreshold".
    const __m128i signBit = _mm_set1_epi32(INT32_MIN);
    const __m128i limit =
        _mm_xor_si128(_mm_set1_epi32(static_cast<int>(kThreshold - 1)), signBit);
    const __m128i four = _mm_set1_epi32(4);
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);

    // Vector path covers whole groups of four only; this also guarantees that
    // the unconditional 16-byte store below stays inside resArr.
    const uint32_t vecEnd = usable - (usable % 4);
    for (uint32_t i = 0; i < vecEnd; i += 4)
    {
        const __m128i values =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i mask =
            _mm_cmpgt_epi32(_mm_xor_si128(values, signBit), limit);

        // Bit k of `index` is set when lane k matched.
        const int index = _mm_movemask_ps(_mm_castsi128_ps(mask));

        // Pack the matching positions to the front and store all four lanes;
        // the surplus lanes are overwritten by later stores because the
        // cursor only advances by the number of matches.
        const __m128i packed = _mm_shuffle_epi8(positions, g_shuffles[index]);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos), packed);
        resPos += g_steps[index];

        positions = _mm_add_epi32(positions, four);
    }

    // Scalar tail for the last usable % 4 elements.
    for (uint32_t i = vecEnd; i < usable; ++i)
    {
        if (arr[i] >= kThreshold)
        {
            *resPos++ = i;
        }
    }

    // The cursor distance is exactly the number of indices emitted.
    *cnt = static_cast<uint32_t>(resPos - resArr);

    std::cout << "cnt " << *cnt << "\n";
    return resArr;
}

答案 1 :(得分:0)

我做了一些修改，应该可以提高性能：

#include <immintrin.h>
#include <memory.h>

/**
 * Collects the indices of all elements of `arr` whose value is >= 10.
 *
 * Groups of four elements are compared with SSE2; a fully matching group is
 * written with a single vector store, a partially matching group falls back to
 * per-lane scalar writes.
 *
 * @param arr  Input values; `num` elements are read. No alignment is required.
 * @param num  Number of elements in `arr` (need not be a multiple of 4).
 * @param cnt  Out-parameter; receives the number of indices written.
 * @return     16-byte aligned buffer of `num` elements from _mm_malloc whose
 *             first `*cnt` entries are the matching indices in ascending
 *             order. The caller must release it with _mm_free. If the
 *             allocation fails, nullptr is returned and `*cnt` is 0.
 *
 * Side effects: writes per-element debug traces (partial groups only) and the
 * final count to std::cout.
 */
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt) 
{
    constexpr uint32_t kThreshold = 10;
    static const char* const kLaneTrace[4] = {
        "first byte not 0\n",
        "second byte not 0\n",
        "3rd byte not 0\n",
        "4th byte not 0\n",
    };

    *cnt = 0;

    uint32_t* const resArr =
        static_cast<uint32_t*>(_mm_malloc(num * sizeof(uint32_t), 16));
    // Without an output buffer there is nothing we can safely process.
    const uint32_t usable = (resArr != nullptr) ? num : 0;
    uint32_t* resPos = resArr;

    // SSE2 has no unsigned 32-bit compare. Flipping the sign bit of both
    // operands turns the signed compare into an unsigned one, and comparing
    // against kThreshold - 1 with "greater than" yields "value >= kThreshold".
    const __m128i signBit = _mm_set1_epi32(INT32_MIN);
    const __m128i limit =
        _mm_xor_si128(_mm_set1_epi32(static_cast<int>(kThreshold - 1)), signBit);
    const __m128i four = _mm_set1_epi32(4);
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);

    // Only whole groups of four use the vector path, so no load ever reads
    // past arr[num - 1].
    const uint32_t vecEnd = usable - (usable % 4);
    for (uint32_t i = 0; i < vecEnd; i += 4)
    {
        const __m128i vec =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hit = _mm_cmpgt_epi32(_mm_xor_si128(vec, signBit), limit);

        // One bit per 32-bit lane: bit k is set when arr[i + k] matched.
        const int lanes = _mm_movemask_ps(_mm_castsi128_ps(hit));

        if (lanes == 0xF)
        {
            // Every lane matched: the position vector is the result as-is.
            _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos), positions);
            resPos += 4;
        }
        else if (lanes != 0)
        {
            for (uint32_t k = 0; k < 4; ++k)
            {
                if ((lanes >> k) & 1)
                {
                    std::cout << kLaneTrace[k];
                    *resPos++ = i + k;
                }
            }
        }

        positions = _mm_add_epi32(positions, four);
    }

    // Scalar tail for the last usable % 4 elements.
    for (uint32_t i = vecEnd; i < usable; ++i)
    {
        if (arr[i] >= kThreshold)
        {
            std::cout << kLaneTrace[i - vecEnd];
            *resPos++ = i;
        }
    }

    // The cursor distance is exactly the number of indices emitted.
    *cnt = static_cast<uint32_t>(resPos - resArr);

    std::cout << "cnt " << *cnt << "\n";
    return resArr;
}

当然,如果循环中的所有标量指令都改为向量指令,那将是很好的。

答案 2 :(得分:0)

这是一个利用 pshufb 技巧进行压缩（compaction）的版本，但未经测试；另外，shuffle 掩码不应该是局部变量。

/**
 * Collects the indices of all elements of `arr` whose value is >= 10, using a
 * pshufb-based left-pack (compaction) of the position vector.
 *
 * @param arr  Input values; `num` elements are read. No alignment is required.
 * @param num  Number of elements in `arr` (need not be a multiple of 4).
 * @param cnt  Out-parameter; receives the number of indices written.
 * @return     16-byte aligned buffer of `num` elements from _mm_malloc whose
 *             first `*cnt` entries are the matching indices in ascending
 *             order; entries beyond `*cnt` are unspecified. The caller must
 *             release it with _mm_free. If the allocation fails, nullptr is
 *             returned and `*cnt` is 0.
 *
 * Requires a CPU with SSSE3 (_mm_shuffle_epi8).
 */
#if defined(__GNUC__) || defined(__clang__)
// Enable SSSE3 for this function only, so the file builds without -mssse3.
__attribute__((target("ssse3")))
#endif
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt) 
{
    constexpr uint32_t kThreshold = 10;

    // pshufb control words. Each 32-bit word selects the four source bytes of
    // one lane (little-endian); X has the high bit set in every byte, which
    // makes pshufb write zeros. Row `m` packs the lanes whose bit is set in
    // `m` to the front, in ascending lane order. Kept static so the table is
    // built once instead of on every call.
    constexpr uint32_t X = 0x80808080u;
    alignas(16) static const uint32_t kCompaction[16][4] = {
        {X,           X,           X,           X},
        {0x03020100u, X,           X,           X},
        {0x07060504u, X,           X,           X},
        {0x03020100u, 0x07060504u, X,           X},
        {0x0B0A0908u, X,           X,           X},
        {0x03020100u, 0x0B0A0908u, X,           X},
        {0x07060504u, 0x0B0A0908u, X,           X},
        {0x03020100u, 0x07060504u, 0x0B0A0908u, X},
        {0x0F0E0D0Cu, X,           X,           X},
        {0x03020100u, 0x0F0E0D0Cu, X,           X},
        {0x07060504u, 0x0F0E0D0Cu, X,           X},
        {0x03020100u, 0x07060504u, 0x0F0E0D0Cu, X},
        {0x0B0A0908u, 0x0F0E0D0Cu, X,           X},
        {0x03020100u, 0x0B0A0908u, 0x0F0E0D0Cu, X},
        {0x07060504u, 0x0B0A0908u, 0x0F0E0D0Cu, X},
        {0x03020100u, 0x07060504u, 0x0B0A0908u, 0x0F0E0D0Cu},
    };
    // Number of set bits in each 4-bit lane mask.
    static const uint8_t kMatches[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                         1, 2, 2, 3, 2, 3, 3, 4};

    *cnt = 0;

    uint32_t* const resArr =
        static_cast<uint32_t*>(_mm_malloc(num * sizeof(uint32_t), 16));
    // Without an output buffer there is nothing we can safely process.
    const uint32_t usable = (resArr != nullptr) ? num : 0;
    uint32_t* resPos = resArr;

    // Signed compare on sign-flipped operands == unsigned compare, so values
    // >= 2^31 are classified correctly. "v > kThreshold - 1" <=> "v >= kThreshold".
    const __m128i signBit = _mm_set1_epi32(INT32_MIN);
    const __m128i limit =
        _mm_xor_si128(_mm_set1_epi32(static_cast<int>(kThreshold - 1)), signBit);
    const __m128i four = _mm_set1_epi32(4);
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);

    // Vector path covers whole groups of four only; this also guarantees that
    // the unconditional 16-byte store below stays inside resArr.
    const uint32_t vecEnd = usable - (usable % 4);
    for (uint32_t i = 0; i < vecEnd; i += 4)
    {
        const __m128i values =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
        const __m128i hit =
            _mm_cmpgt_epi32(_mm_xor_si128(values, signBit), limit);

        // Bit k of `lanes` is set when lane k matched.
        const int lanes = _mm_movemask_ps(_mm_castsi128_ps(hit));

        const __m128i control =
            _mm_load_si128(reinterpret_cast<const __m128i*>(kCompaction[lanes]));
        // Store all four lanes; the surplus ones are overwritten by later
        // stores because the cursor only advances by the number of matches.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(resPos),
                         _mm_shuffle_epi8(positions, control));
        resPos += kMatches[lanes];

        positions = _mm_add_epi32(positions, four);
    }

    // Scalar tail for the last usable % 4 elements.
    for (uint32_t i = vecEnd; i < usable; ++i)
    {
        if (arr[i] >= kThreshold)
        {
            *resPos++ = i;
        }
    }

    // The cursor distance is exactly the number of indices emitted.
    *cnt = static_cast<uint32_t>(resPos - resArr);

    return resArr;
}

这里的思路是：每种情况都可以通过一次 shuffle 把结果排列到位；16 种情况通过加载与情况索引对应的 shuffle 掩码来区分，而情况索引由 movmskps 给出。使用 AVX2 时，可以用 vpermd 实现类似的操作。