我在H264AVC编码器/解码器中有这个功能,它被重复调用:
UInt XDistortion::xGetSAD16x( XDistSearchStruct* pcDSS )
{
XPel* pucCur = pcDSS->pYSearch;
XPel* pucOrg = pcDSS->pYOrg;
Int iStride = pcDSS->iYStride;
Int iRows = pcDSS->iRows;
UInt uiSum = 0;
for( ; iRows != 0; iRows-- )
{
uiSum += Abs( pucOrg[0x0] - pucCur[0x0] );
uiSum += Abs( pucOrg[0x1] - pucCur[0x1] );
uiSum += Abs( pucOrg[0x2] - pucCur[0x2] );
uiSum += Abs( pucOrg[0x3] - pucCur[0x3] );
uiSum += Abs( pucOrg[0x4] - pucCur[0x4] );
uiSum += Abs( pucOrg[0x5] - pucCur[0x5] );
uiSum += Abs( pucOrg[0x6] - pucCur[0x6] );
uiSum += Abs( pucOrg[0x7] - pucCur[0x7] );
uiSum += Abs( pucOrg[0x8] - pucCur[0x8] );
uiSum += Abs( pucOrg[0x9] - pucCur[0x9] );
uiSum += Abs( pucOrg[0xa] - pucCur[0xa] );
uiSum += Abs( pucOrg[0xb] - pucCur[0xb] );
uiSum += Abs( pucOrg[0xc] - pucCur[0xc] );
uiSum += Abs( pucOrg[0xd] - pucCur[0xd] );
uiSum += Abs( pucOrg[0xe] - pucCur[0xe] );
uiSum += Abs( pucOrg[0xf] - pucCur[0xf] );
pucOrg += MB_BUFFER_WIDTH;
pucCur += iStride;
}
return uiSum;
}
我已使用以下SSE代码替换它,但它无效:
UInt XDistortion::xGetSAD16x( XDistSearchStruct* pcDSS )
{
XPel* pucCur = pcDSS->pYSearch;
XPel* pucOrg = pcDSS->pYOrg;
Int iStride = pcDSS->iYStride;
Int iRows = pcDSS->iRows;
__m128i uiSum = _mm_set1_epi32(0);
__m128i x1 = _mm_set1_epi8(MB_BUFFER_WIDTH);
__m128i x2 = _mm_set1_epi8(iStride);
for( ; iRows > 0; iRows -= 4)
{
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x0]), _mm_load_si128((__m128i*)&pucCur[0x0])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x1]), _mm_load_si128((__m128i*)&pucCur[0x1])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x2]), _mm_load_si128((__m128i*)&pucCur[0x2])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x3]), _mm_load_si128((__m128i*)&pucCur[0x3])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x4]), _mm_load_si128((__m128i*)&pucCur[0x4])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x5]), _mm_load_si128((__m128i*)&pucCur[0x5])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x6]), _mm_load_si128((__m128i*)&pucCur[0x6])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x7]), _mm_load_si128((__m128i*)&pucCur[0x7])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x8]), _mm_load_si128((__m128i*)&pucCur[0x8])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x9]), _mm_load_si128((__m128i*)&pucCur[0x9])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xa]), _mm_load_si128((__m128i*)&pucCur[0xa])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xb]), _mm_load_si128((__m128i*)&pucCur[0xb])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xc]), _mm_load_si128((__m128i*)&pucCur[0xc])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xd]), _mm_load_si128((__m128i*)&pucCur[0xd])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xe]), _mm_load_si128((__m128i*)&pucCur[0xe])))));
_mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xf]), _mm_load_si128((__m128i*)&pucCur[0xf])))));
_mm_store_si128((__m128i*)pucOrg, _mm_add_epi32(_mm_load_si128((__m128i*)pucOrg), _mm_load_si128(&x1)));
_mm_store_si128((__m128i*)pucCur, _mm_add_epi32(_mm_load_si128((__m128i*)pucCur), _mm_load_si128(&x2)));
}
return _mm_extract_epi32(uiSum, 0);
}
我不知道在这次转换中我是否有任何错误,因为我不是SSE的专家。 有什么帮助吗?