32位有符号整数的饱和加法内在函数(intrinsics)?

时间:2015-04-07 18:41:09

标签: intel intrinsics fixed-point

有人能推荐一种使用 Intel intrinsics(AVX、SSE4……)对32位有符号整数进行快速饱和加法的方法吗?

我查看了内在函数指南(intrinsics guide)并找到了_mm256_adds_epi16,但它似乎只能对16位整数做加法。我没有找到针对32位的类似函数。其他的加法调用似乎都会回绕(wrap around)。

提示

3 个答案:

答案 0 :(得分:2)

当(且仅当)同时满足以下两个条件时,会发生有符号溢出:

  • 两个输入的符号相同,并且
  • 和(按回绕方式相加得到)的符号与输入的符号不同

用C运算符表示(只有符号位有意义):overflow = ~(a^b) & (a^(a+b))

此外,如果发生溢出,则饱和结果将具有与任一输入相同的符号。使用@PeterCordes建议的int_min = int_max+1技巧,并假设您至少具有SSE4.1(对于blendvps),可以实现为:

/*
 * Lane-wise saturating addition of four signed 32-bit integers.
 *
 * Each lane of the result is a + b, clamped to 0x7FFFFFFF on positive
 * overflow and to 0x80000000 on negative overflow.
 *
 * a, b    : vectors holding four signed 32-bit lanes each.
 * returns : vector with the four saturated sums.
 *
 * Uses blendvps when compiled for SSE4.1; otherwise falls back to a manual
 * and/andnot/or blend that only needs SSE2.
 */
__m128i __mm_adds_epi32( __m128i a, __m128i b )
{
    const __m128i int_max = _mm_set1_epi32( 0x7FFFFFFF );

    // normal result (possibly wraps around)
    __m128i res      = _mm_add_epi32( a, b );

    // If the result saturates, it has the same sign as both a and b:
    // int_max + 0 for non-negative a, int_max + 1 (wraps to int_min) for negative a.
    __m128i sign_bit = _mm_srli_epi32(a, 31); // shift sign to lowest bit
    __m128i saturated = _mm_add_epi32(int_max, sign_bit);

    // saturation happened if inputs do not have different signs,
    // but sign of result is different (only the sign bit of each lane matters):
    __m128i sign_xor  = _mm_xor_si128( a, b );
    __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a,res));

#if defined(__SSE4_1__)
    // blendvps takes the SECOND operand where the mask sign bit is set,
    // so the saturated value must be the second operand.
    return _mm_castps_si128(_mm_blendv_ps( _mm_castsi128_ps( res ),
                                          _mm_castsi128_ps( saturated ),
                                          _mm_castsi128_ps( overflow ) ) );
#else
    // SSE2 fallback: broadcast the sign bit to the whole lane and blend manually.
    __m128i overflow_mask = _mm_srai_epi32(overflow, 31);
    return _mm_or_si128( _mm_and_si128( overflow_mask, saturated ),
                         _mm_andnot_si128( overflow_mask, res ) );
#endif
}

如果您的blendvps与一次移位加一次加法一样快(或更快)(还要考虑端口使用情况),那么您当然可以直接根据a的符号位在int_min和int_max之间进行混合。此外,如果只有SSE2或SSE3,则可以先把overflow算术右移31位,再进行手动混合(使用and / andnot / or),以此替换最后一个混合。

自然,使用AVX2时,它可以使用__m256i变量而不是__m128i(应该很容易重写)。

附录:如果在编译时知道a或b的符号,则可以直接相应地设置saturated,并省去两次_mm_xor_si128计算,即对于正的a,overflow将是_mm_andnot_si128(b, res),对于负的a,overflow将是_mm_andnot_si128(res, b)(其中res = a+b)。

答案 1 :(得分:1)

这是一个适用于 SSE2 的版本,并针对 SSE4.1 (_mm_blendv_ps)、AVX-512VL (_mm_ternarylogic_epi32) 和 AVX-512DQ(_mm_movepi32_mask,来自 Peter Cordes 的建议)做了改进。

/*
 * Saturating add of four signed 32-bit lanes.
 *
 * Computes a + b per lane; a lane that overflows is replaced by INT32_MAX
 * (when a is non-negative) or INT32_MIN (when a is negative). The instruction
 * sequence is picked at compile time from the enabled ISA extensions:
 * AVX-512 DQ+VL mask-add, SSE4.1 blendvps, or a plain SSE2 and/andnot/or blend.
 */
__m128i __mm_adds_epi32( __m128i a, __m128i b) {
  /* Wrapping sum; correct in every lane that did not overflow. */
  const __m128i wrapped = _mm_add_epi32(a, b);

  /* 1 in lanes where a is negative, 0 elsewhere. */
  const __m128i a_negative = _mm_srli_epi32(a, 31);

  const __m128i max_val = _mm_set1_epi32(INT32_MAX);

  /* Sign bit set in lanes where a and b share a sign that the sum lacks. */
#if defined(__AVX512VL__)
  /* Truth table 0x42: true only for (a,b,sum) sign patterns 001 and 110. */
  const __m128i ovf = _mm_ternarylogic_epi32(a, b, wrapped, 0x42);
#else
  const __m128i ovf =
    _mm_andnot_si128(_mm_xor_si128(a, b), _mm_xor_si128(a, wrapped));
#endif

#if defined(__AVX512DQ__) && defined(__AVX512VL__)
  /* Masked add: overflowed lanes get INT32_MAX + a_negative, others keep the sum. */
  return _mm_mask_add_epi32(wrapped, _mm_movepi32_mask(ovf), max_val, a_negative);
#elif defined(__SSE4_1__)
  /* INT32_MAX for non-negative a, INT32_MAX + 1 (wraps to INT32_MIN) otherwise. */
  const __m128i clamp = _mm_add_epi32(max_val, a_negative);
  const __m128 picked =
    _mm_blendv_ps(_mm_castsi128_ps(wrapped),
                  _mm_castsi128_ps(clamp),
                  _mm_castsi128_ps(ovf));
  return _mm_castps_si128(picked);
#else
  /* INT32_MAX for non-negative a, INT32_MAX + 1 (wraps to INT32_MIN) otherwise. */
  const __m128i clamp = _mm_add_epi32(max_val, a_negative);
  /* Spread the overflow sign bit across the whole lane to form a select mask. */
  const __m128i lane_mask = _mm_srai_epi32(ovf, 31);
  const __m128i from_clamp = _mm_and_si128(lane_mask, clamp);
  const __m128i from_sum = _mm_andnot_si128(lane_mask, wrapped);
  return _mm_or_si128(from_clamp, from_sum);
#endif
}

我为 SIMDe 的 NEON vqaddq_s32(和 MSA __msa_adds_s_b)的实现做了这个;如果您需要其他版本,您应该能够从 simde/arm/neon/qadd.h 改编它们。对于 128 位向量,除了 SSE 支持的(8/16 位,有符号和无符号)之外,还有:

  • vqaddq_s32(想想 _mm_adds_epi32)
  • vqaddq_s64(想想 _mm_adds_epi64)
  • vqaddq_u32(想想 _mm_adds_epu32)

vqaddq_u64(想想 _mm_adds_epu64)也存在,但目前依赖于向量扩展。我可以(也许也应该)把生成的代码移植成内在函数,但说实话我不确定能怎样改进它,所以就没有去做。

答案 2 :(得分:0)

此链接回答了这个问题:

https://software.intel.com/en-us/forums/topic/285219

以下是一个示例实现:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Lane-wise saturating addition of four signed 32-bit integers.
 *
 * Each lane of the result is a + b, replaced by 0x80000000 when two negative
 * inputs produce a non-negative wrapped sum, and by 0x7FFFFFFF when two
 * non-negative inputs produce a negative wrapped sum.
 *
 * a, b    : vectors holding four signed 32-bit lanes each.
 * returns : vector with the four saturated sums.
 *
 * Declared static inline so the definition is usable from C (a bare inline
 * definition provides no external symbol) and from a header.
 */
static inline __m128i __mm_adds_epi32( __m128i a, __m128i b )
{
    // Plain locals: a static object cannot be initialised by a function call in C.
    const __m128i int_min = _mm_set1_epi32( -0x7FFFFFFF - 1 );
    const __m128i int_max = _mm_set1_epi32( 0x7FFFFFFF );

    // normal result (possibly wraps around)
    __m128i res      = _mm_add_epi32( a, b );
    __m128i sign_and = _mm_and_si128( a, b );
    __m128i sign_or  = _mm_or_si128( a, b );

    // Sign bit set where both inputs are negative but the sum is not.
    __m128i min_sat_mask = _mm_andnot_si128( res, sign_and );
    // Sign bit set where both inputs are non-negative but the sum is negative.
    __m128i max_sat_mask = _mm_andnot_si128( sign_or, res );

#if defined(__SSE4_1__)
    // blendvps takes the second operand where the mask sign bit is set.
    __m128 res_temp = _mm_blendv_ps(_mm_castsi128_ps( res ),
                                    _mm_castsi128_ps( int_min ),
                                    _mm_castsi128_ps( min_sat_mask ) );

    return _mm_castps_si128(_mm_blendv_ps( res_temp,
                                          _mm_castsi128_ps( int_max ),
                                          _mm_castsi128_ps( max_sat_mask ) ) );
#else
    // SSE2 fallback: widen each sign bit to a full-lane mask and blend manually.
    // The two masks can never be set in the same lane.
    __m128i min_lanes = _mm_srai_epi32( min_sat_mask, 31 );
    __m128i max_lanes = _mm_srai_epi32( max_sat_mask, 31 );
    __m128i kept      = _mm_andnot_si128( _mm_or_si128( min_lanes, max_lanes ), res );
    __m128i clamped   = _mm_or_si128( _mm_and_si128( min_lanes, int_min ),
                                      _mm_and_si128( max_lanes, int_max ) );
    return _mm_or_si128( kept, clamped );
#endif
}

void addSaturate(int32_t* bufferA, int32_t* bufferB, size_t numSamples)
{
    //
    // Load and add
    //
    __m128i* pSrc1 = (__m128i*)bufferA;
    __m128i* pSrc2 = (__m128i*)bufferB;

    for(int i=0; i<numSamples/4; ++i)
    {
        __m128i res = __mm_adds_epi32(*pSrc1, *pSrc2);
        _mm_store_si128(pSrc1, res);

        pSrc1++;
        pSrc2++;
    }
}