绝对差之和的SSE等效代码

时间:2014-07-01 15:09:37

标签: visual-studio-2010 h.264 sse encoder decoder

我在H264AVC编码器/解码器中有这个功能,它被重复调用:

UInt XDistortion::xGetSAD16x( XDistSearchStruct* pcDSS )
{
  XPel* pucCur  = pcDSS->pYSearch;
  XPel* pucOrg  = pcDSS->pYOrg;
  Int   iStride = pcDSS->iYStride;
  Int   iRows   = pcDSS->iRows;

  UInt uiSum = 0;

  for( ; iRows != 0; iRows-- )
  {
    uiSum += Abs( pucOrg[0x0] - pucCur[0x0] );
    uiSum += Abs( pucOrg[0x1] - pucCur[0x1] );
    uiSum += Abs( pucOrg[0x2] - pucCur[0x2] );
    uiSum += Abs( pucOrg[0x3] - pucCur[0x3] );
    uiSum += Abs( pucOrg[0x4] - pucCur[0x4] );
    uiSum += Abs( pucOrg[0x5] - pucCur[0x5] );
    uiSum += Abs( pucOrg[0x6] - pucCur[0x6] );
    uiSum += Abs( pucOrg[0x7] - pucCur[0x7] );
    uiSum += Abs( pucOrg[0x8] - pucCur[0x8] );
    uiSum += Abs( pucOrg[0x9] - pucCur[0x9] );
    uiSum += Abs( pucOrg[0xa] - pucCur[0xa] );
    uiSum += Abs( pucOrg[0xb] - pucCur[0xb] );
    uiSum += Abs( pucOrg[0xc] - pucCur[0xc] );
    uiSum += Abs( pucOrg[0xd] - pucCur[0xd] );
    uiSum += Abs( pucOrg[0xe] - pucCur[0xe] );
    uiSum += Abs( pucOrg[0xf] - pucCur[0xf] );
    pucOrg += MB_BUFFER_WIDTH;
    pucCur += iStride;
  }
  return uiSum;
}

我已使用以下SSE代码替换它,但它无效:

UInt XDistortion::xGetSAD16x( XDistSearchStruct* pcDSS )
{
    XPel* pucCur  = pcDSS->pYSearch;
    XPel* pucOrg  = pcDSS->pYOrg;
    Int   iStride = pcDSS->iYStride;
    Int   iRows   = pcDSS->iRows;

    __m128i uiSum = _mm_set1_epi32(0);
    __m128i x1 = _mm_set1_epi8(MB_BUFFER_WIDTH);
    __m128i x2 = _mm_set1_epi8(iStride);

    for( ; iRows > 0; iRows -= 4)
        {
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x0]), _mm_load_si128((__m128i*)&pucCur[0x0])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x1]), _mm_load_si128((__m128i*)&pucCur[0x1])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x2]), _mm_load_si128((__m128i*)&pucCur[0x2])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x3]), _mm_load_si128((__m128i*)&pucCur[0x3])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x4]), _mm_load_si128((__m128i*)&pucCur[0x4])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x5]), _mm_load_si128((__m128i*)&pucCur[0x5])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x6]), _mm_load_si128((__m128i*)&pucCur[0x6])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x7]), _mm_load_si128((__m128i*)&pucCur[0x7])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x8]), _mm_load_si128((__m128i*)&pucCur[0x8])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0x9]), _mm_load_si128((__m128i*)&pucCur[0x9])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xa]), _mm_load_si128((__m128i*)&pucCur[0xa])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xb]), _mm_load_si128((__m128i*)&pucCur[0xb])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xc]), _mm_load_si128((__m128i*)&pucCur[0xc])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xd]), _mm_load_si128((__m128i*)&pucCur[0xd])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xe]), _mm_load_si128((__m128i*)&pucCur[0xe])))));
            _mm_store_si128(&uiSum, _mm_add_epi32(_mm_load_si128(&uiSum), _mm_abs_epi32(_mm_sub_epi32(_mm_load_si128((__m128i*)&pucOrg[0xf]), _mm_load_si128((__m128i*)&pucCur[0xf])))));

            _mm_store_si128((__m128i*)pucOrg, _mm_add_epi32(_mm_load_si128((__m128i*)pucOrg), _mm_load_si128(&x1)));
            _mm_store_si128((__m128i*)pucCur, _mm_add_epi32(_mm_load_si128((__m128i*)pucCur), _mm_load_si128(&x2)));
        }
        return _mm_extract_epi32(uiSum, 0);
}

我不知道在这次转换中我是否有任何错误,因为我不是SSE的专家。 有什么帮助吗?

0 个答案:

没有答案