我想编写一个函数 norm2,用来计算 $\sum_{i=0}^{3} (a_i - b_i)^2$,其中 $a_i$、$b_i$ 分别是 32 位整数 a 和 b 的第 $i$ 个字节:
uint32_t norm2(uint32_t a, uint32_t b) {
return sqd( a & 0x000000FF , b & 0x000000FF )
+ sqd((a & 0x0000FF00)>> 8, (b & 0x0000FF00)>> 8)
+ sqd((a & 0x00FF0000)>>16, (b & 0x00FF0000)>> 16)
+ sqd((a & 0xFF000000)>>24, (b & 0xFF000000)>> 24);
}
uint32_t sqd(uint32_t a, uint32_t b) {
uint32_t x = (a > b) ? a - b : b - a;
return x*x;
}
在 GCC 下最快的实现方法是什么?例如使用汇编、SSE 或类似的技术。
答案 0 :(得分:4)
使用 SSE,只需几条指令就可以完成整个操作,非常简单:
#include <immintrin.h>
#include <stdint.h>
uint32_t norm2(uint32_t a, uint32_t b) {
const __m128i vec_zero = _mm_setzero_si128();
__m128i vec_a = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a), vec_zero);
__m128i vec_b = _mm_unpacklo_epi8(_mm_cvtsi32_si128(b), vec_zero);
__m128i vec_diff = _mm_sub_epi16(vec_a, vec_b);
__m128i vec_dsq = _mm_madd_epi16(vec_diff, vec_diff);
return _mm_cvtsi128_si32(_mm_hadd_epi32(vec_dsq, vec_dsq));
}
这里的做法是:把 a 和 b 分别与零向量“解包”(unpack),将各个字节扩展为 16 位整数的向量;然后将二者相减(按 16 位整数运算,避免溢出的风险),再对差值做乘加运算(结果为 32 位整数,同样避免溢出的风险)。
我没有安装 GCC 来测试,但上面的代码用 clang 编译后得到了接近最优的汇编;这么简单的任务没有必要手写汇编。
答案 1 :(得分:2)
如果您可以一次读取 4 组 a 和 b,那么对 4 元组进行操作是最干净、优雅、高效的做法,因为这样能更充分地利用某些指令的全部宽度,使计算出的每一部分都是最终结果的一部分。以下解决方案最高只用到 SSSE3。当然,您最好把它从函数中提取出来,预先初始化常量,并根据周围代码的结构,找到把值装入 __m128i 的最有效方法。
// a, b, and out, must all point to 4 integers
void norm2x4(const unsigned *a, const unsigned *b, unsigned *out) {
// load up registers a and b, in practice this should probably not be in a function,
// initialization of zero can happen outside of a loop,
// and a and b can be loaded directly from memory into __m128i registers
__m128i const zero = _mm_setzero_si128();
__m128i alo = _mm_loadu_si128((__m128i*)a); // this can also be adapted to aligned read instructions if you ensure an aligned buffer
__m128i blo = _mm_loadu_si128((__m128i*)b);
// everything is already in the register where we need it except it
// needs to be expanded to 2-byte ints for computations to work correctly
__m128i ahi = _mm_unpackhi_epi8(alo, zero);
__m128i bhi = _mm_unpackhi_epi8(blo, zero);
alo = _mm_unpacklo_epi8(alo, zero);
blo = _mm_unpacklo_epi8(blo, zero);
alo = _mm_sub_epi16(alo, blo); // don't care if a - b, or b - a, the "wrong" one will result in a
ahi = _mm_sub_epi16(ahi, bhi); // negation the square will later correct
alo = _mm_madd_epi16(alo, alo); // perform the square, and add every two adjacent
ahi = _mm_madd_epi16(ahi, ahi);
alo = _mm_hadd_epi32(alo, ahi); // add horizontal elements; ahi now contains 4 ints which are your results
// store the result to output; this can be adapted to an aligned store if you ensure an aligned buffer
// or the individual values can be extracted directly to 32-bit registers using _mm_extract_epi32
_mm_storeu_si128((__m128i*)out, alo);
}
答案 2 :(得分:1)
无分支版本(因为 square(-x) == square(x)):
uint32_t sqd(int32_t a, int32_t b) {
int32_t x = a - b;
return x * x;
}
uint32_t norm2(uint32_t a, uint32_t b) {
return sqd( a & 0x000000FF , b & 0x000000FF )
+ sqd((a & 0x0000FF00) >> 8, (b & 0x0000FF00) >> 8)
+ sqd((a & 0x00FF0000) >> 16, (b & 0x00FF0000) >> 16)
+ sqd((a & 0xFF000000) >> 24, (b & 0xFF000000) >> 24);
}