Question

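我想编写一个函数 norm2，用于计算

$$\mathrm{norm2}(a, b) = \sum_{i=0}^{3} (a_i - b_i)^2$$

其中 $a_i$、$b_i$ 分别表示 32 位整数 a、b 的第 i 个字节。目前的实现如下：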

/*
 * Squared difference of two unsigned values: (a - b)^2.
 *
 * The smaller operand is always subtracted from the larger one, so the
 * unsigned subtraction itself never wraps around.
 *
 * Defined ahead of norm2 so that it is declared at its point of use
 * (implicit function declarations are not valid since C99).
 */
uint32_t sqd(uint32_t a, uint32_t b) {
  uint32_t x = (a > b) ? a - b : b - a;
  return x*x;
}

/*
 * Treats a and b as four packed bytes each and returns the sum of the
 * squared differences of the corresponding bytes.
 *
 * Each byte is isolated with a mask and shifted down to the range 0..255
 * before being handed to sqd.
 */
uint32_t norm2(uint32_t a, uint32_t b) {
  return sqd( a & 0x000000FF     ,  b & 0x000000FF      )
       + sqd((a & 0x0000FF00)>> 8, (b & 0x0000FF00)>>  8)
       + sqd((a & 0x00FF0000)>>16, (b & 0x00FF0000)>> 16)
       + sqd((a & 0xFF000000)>>24, (b & 0xFF000000)>> 24);
}

在 GCC 下实现它最快的方法是什么？例如使用汇编、SSE 或类似技术。

Answer 1

使用 SSE，只需几条指令就能完成整个运算：

#include <immintrin.h>
#include <stdint.h>

/*
 * Treats a and b as four packed bytes each and returns the sum of the
 * squared differences of the corresponding bytes.
 *
 * Uses SSE2 intrinsics only, so it builds at the x86-64 baseline without
 * extra target flags.
 */
uint32_t norm2(uint32_t a, uint32_t b) {
    const __m128i vec_zero = _mm_setzero_si128();

    /* Interleave with zero: each of the 4 bytes becomes a 16-bit lane;
       the upper four 16-bit lanes are zero. */
    __m128i vec_a = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a), vec_zero);
    __m128i vec_b = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)b), vec_zero);

    /* Per-byte difference in 16 bits (range -255..255); the sign does not
       matter because the value is squared next. */
    __m128i vec_diff = _mm_sub_epi16(vec_a, vec_b);

    /* Square each lane and add adjacent pairs into 32-bit lanes:
       [d0^2 + d1^2, d2^2 + d3^2, 0, 0]. */
    __m128i vec_dsq = _mm_madd_epi16(vec_diff, vec_diff);

    /* Bring 32-bit lane 1 down to lane 0 and add, so lane 0 holds the
       total of all four squared differences. */
    __m128i vec_sum = _mm_add_epi32(vec_dsq, _mm_srli_epi64(vec_dsq, 32));

    return (uint32_t)_mm_cvtsi128_si32(vec_sum);
}

我们在这里做的是：把 a 和 b 分别与零向量“解包”（交错），将各个字节零扩展为 16 位整数的向量。然后将它们相减（按 16 位整数计算，避免溢出的风险），再对差值做乘加运算（结果为 32 位整数，同样避免溢出的风险）。

我没有安装 GCC 来测试，但上面的代码用 clang 编译后生成的汇编已接近最优；对于这么简单的任务，没有必要手写汇编。

Answer 2

如果您可以每次读取 4 组 a 和 b，那么以 4 元组为单位进行运算是最干净、优雅且高效的做法：这样能更充分地利用某些指令，使每条指令计算出的所有部分都是结果的一部分。下面的解决方案最高只用到 SSSE3。当然，在实际使用中，您最好把这段代码从函数中提取出来，预先初始化常量，并根据周围代码的结构，找出把数据装入 __m128i 变量的最高效方式。

// a, b, and out must each point to 4 integers.
// For each i in 0..3, out[i] receives the sum of the squared differences of
// the four bytes of a[i] and the four bytes of b[i].
// NOTE(review): assumes `unsigned` is 32 bits wide, so that 4 of them fill
// one 128-bit register — confirm for the target platform.
void norm2x4(const unsigned *a, const unsigned *b, unsigned *out) {
  // load up registers a and b, in practice this should probably not be in a function,
  // initialization of zero can happen outside of a loop,
  // and a and b can be loaded directly from memory into __m128i registers
  __m128i const zero = _mm_setzero_si128();
  __m128i       alo  = _mm_loadu_si128((const __m128i*)a); // this can also be adapted to aligned read instructions if you ensure an aligned buffer
  __m128i       blo  = _mm_loadu_si128((const __m128i*)b);

  // everything is already in the register where we need it except it
  // needs to be expanded to 2-byte ints for computations to work correctly
  __m128i       ahi = _mm_unpackhi_epi8(alo, zero);
  __m128i       bhi = _mm_unpackhi_epi8(blo, zero);
  alo               = _mm_unpacklo_epi8(alo, zero);
  blo               = _mm_unpacklo_epi8(blo, zero);
  alo               = _mm_sub_epi16(alo, blo);  // don't care if a - b, or b - a, the "wrong" one will result in a
  ahi               = _mm_sub_epi16(ahi, bhi);  // negation the square will later correct
  alo               = _mm_madd_epi16(alo, alo); // perform the square, and add every two adjacent
  ahi               = _mm_madd_epi16(ahi, ahi);

  // add horizontal pairs of 32-bit lanes: gather the even lanes and the odd
  // lanes of (alo, ahi) with two shuffles, then add them. The shuffles are
  // pure bit moves, so routing the integers through the float domain does not
  // alter them. This needs only SSE/SSE2.
  __m128 const  lo_ps = _mm_castsi128_ps(alo);
  __m128 const  hi_ps = _mm_castsi128_ps(ahi);
  __m128i const even  = _mm_castps_si128(_mm_shuffle_ps(lo_ps, hi_ps, _MM_SHUFFLE(2, 0, 2, 0)));
  __m128i const odd   = _mm_castps_si128(_mm_shuffle_ps(lo_ps, hi_ps, _MM_SHUFFLE(3, 1, 3, 1)));
  alo                 = _mm_add_epi32(even, odd); // alo now contains 4 ints which are your results

  // store the result to output; this can be adapted to an aligned store if you ensure an aligned buffer
  // or the individual values can be extracted directly to 32-bit registers using _mm_extract_epi32
  _mm_storeu_si128((__m128i*)out, alo);
}

Answer 3

无分支版本（square(-x) == square(x)）：

/*
 * Branch-free squared difference: (a - b)^2.
 *
 * No absolute value is needed because squaring discards the sign. The
 * subtraction and multiplication are carried out in uint32_t: unsigned
 * arithmetic wraps modulo 2^32 instead of invoking the undefined behaviour
 * of signed overflow, and it yields the same value as the signed
 * computation whenever that one does not overflow.
 */
uint32_t sqd(int32_t a, int32_t b) {
  uint32_t x = (uint32_t)a - (uint32_t)b;
  return x * x;
}

/*
 * Treats a and b as four packed bytes each and returns the sum of sqd()
 * applied to each pair of corresponding bytes, lowest byte first.
 */
uint32_t norm2(uint32_t a, uint32_t b) {
  uint32_t total = 0;
  for (unsigned shift = 0; shift < 32; shift += 8) {
    /* Shift the wanted byte down to bits 0..7 and mask off the rest. */
    total += sqd((a >> shift) & 0xFF, (b >> shift) & 0xFF);
  }
  return total;
}

计算两个整数（各解释为 4 个字节）之间的范数

3 个答案: