从SSE向量中提取标量值

时间:2013-06-17 08:45:15

标签: c x86 sse simd

我有一段代码,以类似SIMD的方式比较数组元素,判断它们是否大于某个值:

/*
 * Zero, in place, every element of dst that is not greater than 2.
 *
 * Elements are processed four at a time and compared as signed 32-bit
 * integers (_mm_cmpgt_epi32).  Elements greater than 2 are left unchanged.
 *
 * dst  array of N elements; must be 16-byte aligned because the aligned
 *      load/store intrinsics are used.
 * N    number of elements; expected to be a multiple of 4, since every
 *      iteration loads and stores a full group of four.
 */
void sse(uint *dst, size_t N)
{
    const __m128i condition = _mm_set1_epi32(2);

    // The index has the same type as N: an index narrower than size_t could
    // wrap around before reaching a large N and the loop would never end.
    for (size_t i = 0; i < N; i += 4)
    {
        __m128i v = _mm_load_si128((__m128i *)&dst[i]);
        __m128i cmp = _mm_cmpgt_epi32(v, condition);  // all-ones lane where v > 2
        v = _mm_and_si128(v, cmp);                    // keep passing lanes, zero the rest
        _mm_store_si128((__m128i *)&dst[i], v);
    }
}

现在,在比较之后、对元素做按位与(_mm_and_si128)之前,我想统计通过条件的元素个数(即比较结果中被置位的那些元素),并把总数存入一个int变量。我该如何用SIMD做到这一点?例如,如果四个元素中只有两个通过了条件,则该int变量应等于2。

1 个答案:

答案 0 :(得分:7)

通常,您将在整个循环中保持向量计数,然后在循环终止时仅对向量的元素求和,例如

#include <emmintrin.h>

/*
 * Skeleton of the counting loop: zero every element of dst that is not
 * greater than 2 (signed 32-bit compare) while keeping a running per-lane
 * tally in vcount.
 *
 * The final reduction of vcount into count is intentionally left out here,
 * so this skeleton returns 0 as written.
 *
 * dst  array of N elements, 16-byte aligned (aligned load/store are used);
 *      it is written through despite the const qualifier.
 * N    number of elements, processed in groups of four.
 */
uint32_t sse(const uint32_t *dst, const size_t N)
{
    const __m128i threshold = _mm_set1_epi32(2);
    __m128i vcount = _mm_setzero_si128();   // per-lane tallies, kept negated
    uint32_t count = 0;
    size_t pos = 0;

    while (pos < N)
    {
        __m128i *group = (__m128i *)&dst[pos];
        const __m128i lanes = _mm_load_si128(group);
        const __m128i passed = _mm_cmpgt_epi32(lanes, threshold); // -1 where lane > 2, else 0

        _mm_store_si128(group, _mm_and_si128(lanes, passed));
        vcount = _mm_add_epi32(vcount, passed); // adding -1 per hit => negative counts
        pos += 4;
    }
    // ... sum vcount here and store in count (see below) ...
    return count;
}

请注意,我们将每个掩码元素视为一个int,即0或-1,因此我们正在积累一个总和,它是实际和的负数。

最终对vcount求和的效率通常不太重要,因为它在整个循环结束后只执行一次;因此只要N足够大,它需要多少条指令都无关紧要(在合理范围内)。

有几种处理最终总和的方法,例如:您可以使用_mm_movemask_epi8(SSE2)提取16位掩码并使用它,或者您可以使用_mm_hadd_epi32(SSSE3)计算向量上的水平和,然后将总和提取为标量,例如

SSE2:

#include <emmintrin.h>

int16_t mask = _mm_movemask_epi8(vcount);       // extract 16 bit mask
count = !!(mask & 0x0001) +                     // count non-zero 32 bit elements
        !!(mask & 0x0010) + 
        !!(mask & 0x0100) + 
        !!(mask & 0x1000);

SSSE3:

#include <tmmintrin.h>

vcount = _mm_hadd_epi32(vcount, vcount);        // pairwise sums of adjacent lanes
vcount = _mm_hadd_epi32(vcount, vcount);        // every lane now holds the full sum
// Rebuild the low 32-bit lane from its two 16-bit halves: word 1 is the high
// half, word 0 the low half.  The halves are combined as unsigned values so
// the shift cannot overflow a signed int, then negated to get the total
// (positive) count.
count = 0u - (((unsigned)_mm_extract_epi16(vcount, 1) << 16)
              | (unsigned)_mm_extract_epi16(vcount, 0));

SSE4.2:

#include <smmintrin.h>

// Reduce the four per-lane (negated) counters held in vcount to one scalar.
// NOTE(review): _mm_hadd_epi32 is an SSSE3 intrinsic and _mm_extract_epi32 an
// SSE4.1 intrinsic, so SSE4.1 support is sufficient for this variant.
vcount = _mm_hadd_epi32(vcount, vcount);        // pairwise sums of adjacent lanes
vcount = _mm_hadd_epi32(vcount, vcount);        // every lane now holds the full sum
count = - _mm_extract_epi32(vcount, 0);         // extract lane 0 (and negate it) to
                                                // get total (positive) count

下面是一个完整的可运行版本,附带针对SSE4.2版本的测试代码:

#include <stdio.h>
#include <stdint.h>
#include <smmintrin.h>

/*
 * Zero every element of dst that is not greater than 2 and return how many
 * elements were greater than 2 (and were therefore left unchanged).
 *
 * dst  array of N elements, 16-byte aligned (aligned SSE load/store are
 *      used).  It is modified in place even though the parameter is
 *      const-qualified; the qualifier is kept so existing callers still
 *      compile, but dst must not point at an object that was defined const.
 * N    number of elements.  Any value is accepted: full groups of four are
 *      handled with SSE, a trailing group of one to three elements is handled
 *      with scalar code so nothing at or beyond dst[N] is read or written.
 *
 * NOTE: elements are compared as signed 32-bit integers (that is what
 * _mm_cmpgt_epi32 does), so values >= 0x80000000 are treated as negative,
 * i.e. zeroed and not counted.  The scalar tail mirrors that behaviour.
 */
uint32_t sse(const uint32_t *dst, const size_t N)
{
    const __m128i condition = _mm_set1_epi32(2);
    const size_t vec_end = N - (N % 4);     // end of the last full group of four
    uint32_t *out = (uint32_t *)dst;        // writable view of the same array
    __m128i vcount = _mm_setzero_si128();
    uint32_t count;
    size_t i;

    for (i = 0; i < vec_end; i += 4)
    {
        __m128i v = _mm_load_si128((const __m128i *)&dst[i]);
        __m128i vcmp = _mm_cmpgt_epi32(v, condition);  // -1 where v > 2, else 0
        v = _mm_and_si128(v, vcmp);
        _mm_store_si128((__m128i *)&out[i], v);
        vcount = _mm_add_epi32(vcount, vcmp); // accumulate (negative) counts
    }

    vcount = _mm_hadd_epi32(vcount, vcount);    // horizontal sum of 4 elements
    vcount = _mm_hadd_epi32(vcount, vcount);
    // Negate in unsigned arithmetic so the conversion to the positive count
    // is well defined for every possible lane value.
    count = 0u - (uint32_t)_mm_extract_epi32(vcount, 0);

    // Scalar tail for the last N % 4 elements.
    for (; i < N; i++)
    {
        if ((int32_t)dst[i] > 2)
        {
            count++;
        }
        else
        {
            out[i] = 0;
        }
    }

    return count;
}

/*
 * Test harness: runs sse() over a four-element array, then prints the
 * resulting array contents and the returned count.
 */
int main(void)
{
    enum { NUM_VALUES = 4 };

    // sse() uses aligned 128-bit loads/stores, hence the 16-byte alignment.
    uint32_t values[NUM_VALUES] __attribute__ ((aligned(16))) = { 1, 2, 3, 4 };
    const uint32_t kept = sse(values, NUM_VALUES);

    printf("a = %u %u %u %u \n", values[0], values[1], values[2], values[3]);
    printf("count = %u\n", kept);

    return 0;
}

$ gcc -Wall -std=c99 -msse4 sse_count.c -o sse_count
$ ./sse_count
a = 0 0 3 4 
count = 2