我正在尝试用SSE实现一个非常快速的阈值算法,用来取代下面这段代码:
uint8_t *pSrc, *pDst;
// Assume pSrc and pDst point to valid data
// Handle left edge: the first byte is copied unconditionally
*pDst++ = *pSrc++;
// Likeness filter: overwrite the destination byte with the source byte when
// their squared difference exceeds the squared threshold.
for (uint32_t k = 2; k < width; k++, pSrc++, pDst++) {
    // uint8_t operands are promoted to int, so the difference may be negative
    // and the square cannot overflow (at most 255 * 255).
    const int diff = *pDst - *pSrc;
    if (diff * diff > 100 /*THRESHOLD_SQUARED*/) {
        *pDst = *pSrc;
    }
}
// Handle right edge: the last byte is copied unconditionally
*pDst++ = *pSrc++;
到目前为止,我有这个:
const uint8_t THRESHOLD = 10;
__attribute__((aligned (16))) static const uint8_t mask[16] = {
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};
__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();
uint8_t *pSrc, *pDst;
// Assume pSrc and pDst point to valid data
// I have other code with another mask for the first 16 entries
for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
xmm3 = _mm_load_si128((__m128i const *)pDst);
xmm4 = _mm_load_si128((__m128i const *)pSrc);
xmm5 = _mm_unpacklo_epi8(xmm3, xmm6);
xmm7 = _mm_unpackhi_epi8(xmm3, xmm6);
xmm8 = _mm_unpacklo_epi8(xmm4, xmm6);
xmm9 = _mm_unpackhi_epi8(xmm4, xmm6);
xmm5 = _mm_sub_epi16(xmm5, xmm8);
xmm7 = _mm_sub_epi16(xmm7, xmm9);
xmm5 = _mm_abs_epi16(xmm5);
xmm7 = _mm_abs_epi16(xmm7);
xmm5 = _mm_packs_epi16(xmm5, xmm7);
xmm5 = _mm_cmpgt_epi8(xmm5, xmm1);
xmm3 = _mm_blendv_epi8(xmm3, xmm4, xmm5);
_mm_store_si128((__m128i *)pDst, xmm3);
}
// I have other code with another mask for the last 16 entries
我还想到可以用另一种算法来计算两个值之差的绝对值(主要目的是让运算始终留在U8(uchar)范围内):
a' = a >> 1;
b' = b >> 1;
diff = (abs(sub(a' - b')) << 1) + ((a ^ b) & 1);
这样只需要8条SSE指令,而不是上面的9条(不包括编译器额外生成的寄存器搬移指令),但由于指令间的依赖会带来延迟,我不确定它是否真的更快。
各位SSE专家有没有更好的建议(可以使用SSE 4.2)?
更新1 - 感谢Yves的建议!
const uint8_t THRESHOLD = 10;
__attribute__((aligned (16))) static const uint8_t mask[16] = {
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};
__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();
uint8_t *pSrc, *pDst;
// Assume pSrc and pDst point to valid data
// I have other code with another mask for the first 16 entries
for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
xmm3 = _mm_load_si128((__m128i const *)pDst);
xmm4 = _mm_load_si128((__m128i const *)pSrc);
xmm5 = _mm_subs_epu8(xmm3, xmm4);
xmm7 = _mm_subs_epu8(xmm4, xmm3);
xmm5 = _mm_adds_epu8(xmm5, xmm7);
xmm5 = _mm_subs_epu8(xmm5, xmm1);
xmm5 = _mm_cmpeq_epi8(xmm5, xmm6);
xmm4 = _mm_blendv_epi8(xmm4, xmm3, xmm5);
_mm_store_si128((__m128i *)pDst, xmm4);
}
// I have other code with another mask for the last 16 entries
答案 0 :(得分:6)
有一种有效的替代方案可以计算绝对差值,利用算术饱和度。
实际上,无符号饱和减法计算的是 $A \ominus B = \max(A - B,\ 0)$,因此 $|A - B| = (A \ominus B) + (B \ominus A)$。
// Per-byte |A - B| for unsigned 8-bit lanes: each saturating subtraction
// clamps a negative result to 0, so one term is 0 and the other is the magnitude.
Diff= _mm_adds_epu8(_mm_subs_epu8(A, B), _mm_subs_epu8(B, A));
这个和不会饱和,因为两项中至少有一项为0。这样就可以始终保持16个8位无符号数的形式进行运算,从而获得最大吞吐量。
答案 1 :(得分:1)
Simd库中有一些有用的函数:
/**
 * Bitwise select between two vectors.
 *
 * For every bit position, the result takes the bit from `positive` where the
 * corresponding bit of `mask` is 1 and from `negative` where it is 0.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so a call that is not inlined would fail to link.
 *
 * @param mask     selection mask (typically all-ones / all-zeros per lane)
 * @param positive value chosen where the mask bits are set
 * @param negative value chosen where the mask bits are clear
 * @return the bitwise blend of `positive` and `negative`
 */
static inline __m128i Combine(__m128i mask, __m128i positive, __m128i negative)
{
    return _mm_or_si128(_mm_and_si128(mask, positive), _mm_andnot_si128(mask, negative));
}
/**
 * Absolute difference of sixteen unsigned 8-bit lanes.
 *
 * Subtracting the per-lane unsigned minimum from the per-lane unsigned
 * maximum never goes negative, so the plain (wrapping) byte subtraction
 * yields |a - b| exactly.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so a call that is not inlined would fail to link.
 *
 * @param a first vector of unsigned bytes
 * @param b second vector of unsigned bytes
 * @return per-lane |a - b|
 */
static inline __m128i AbsDifferenceU8(__m128i a, __m128i b)
{
    return _mm_sub_epi8(_mm_max_epu8(a, b), _mm_min_epu8(a, b));
}
/**
 * Unsigned per-byte comparison a <= b.
 *
 * SSE2 has no unsigned byte compare; a <= b holds exactly when
 * min(a, b) == a, which is what is tested here.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so a call that is not inlined would fail to link.
 *
 * @param a left-hand vector of unsigned bytes
 * @param b right-hand vector of unsigned bytes
 * @return 0xFF in each lane where a <= b, 0x00 elsewhere
 */
static inline __m128i LesserOrEqual8u(__m128i a, __m128i b)
{
    return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
}
所以SSE2优化将如下所示:
__m128i t = _mm_set1_epi8(threshold);
for (uint32_t k = 16; k < width - 16; pSrc += 16, pDst += 16)
{
__m128i src = _mm_load_si128((__m128i*)pSrc);
__m128i dst = _mm_load_si128((__m128i*)pDst);
__m128i mask = LesserOrEqual8u(AbsDifferenceU8(src, dst), t);
_mm_strore_si128((__m128i*)pDst, Combine(mask, dst, src);
}