有人能推荐一种使用 Intel intrinsics（AVX、SSE4……）快速实现 32 位有符号整数饱和加法的方法吗？
我查看了 intrinsics 指南并找到了 _mm256_adds_epi16，但它似乎只能对 16 位整数做饱和加法。我没有找到对应的 32 位版本，其他加法调用似乎都会回绕（wrap around）。
提示
答案 0 :(得分:2)
当且仅当以下两个条件同时成立时，才会发生有符号溢出：两个输入的符号相同，并且（按回绕方式相加得到的）和的符号与输入的符号不同。
用 C 运算符表示：overflow = ~(a^b) & (a^(a+b))。
此外，如果发生溢出，饱和结果的符号与两个输入的符号相同。利用 @PeterCordes 建议的 int_min = int_max+1 技巧，并假设至少支持 SSE4.1（用于 blendvps），可以这样实现：
// Saturating addition of four packed signed 32-bit integers.
// Lanes whose exact sum does not fit in 32 bits are clamped to 0x7FFFFFFF
// (both inputs non-negative) or 0x80000000 (both inputs negative); every
// other lane holds the ordinary sum. Requires SSE4.1 (blendvps).
__m128i __mm_adds_epi32( __m128i a, __m128i b )
{
    const __m128i int_max = _mm_set1_epi32( 0x7FFFFFFF );
    // normal result (possibly wraps around)
    __m128i res = _mm_add_epi32( a, b );
    // If result saturates, it has the same sign as both a and b:
    // int_max + 0 for non-negative a, int_max + 1 (wraps to int_min) for negative a
    __m128i sign_bit = _mm_srli_epi32(a, 31); // shift sign to lowest bit
    __m128i saturated = _mm_add_epi32(int_max, sign_bit);
    // saturation happened if inputs do not have different signs,
    // but sign of result is different (only the sign bit of `overflow` matters):
    __m128i sign_xor = _mm_xor_si128( a, b );
    __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a,res));
    // blendvps picks the SECOND operand in lanes where the mask's sign bit is
    // set, so `res` must come first and `saturated` second.
    return _mm_castps_si128(_mm_blendv_ps( _mm_castsi128_ps( res ),
                                           _mm_castsi128_ps( saturated ),
                                           _mm_castsi128_ps( overflow ) ) );
}
如果在您的 CPU 上 blendvps 与一次移位加一次加法一样快（或更快）（还要考虑端口占用情况），那么当然可以直接根据 a 的符号位在 int_min 和 int_max 之间进行混合。
此外，如果只有 SSE2 或 SSE3，可以把 overflow 算术右移 31 位，再手动混合（使用 and / andnot / or），以此替换最后的 blend。
自然，使用 AVX2 时，可以改用 __m256i 变量而不是 __m128i（应该很容易改写）。
附录：如果在编译时已知 a 或 b 的符号，则可以直接相应地设置 saturated，并省去两次 _mm_xor_si128 计算：对于正值 a，overflow 将是 _mm_andnot_si128(b, res)；对于负值 a，overflow 将是 _mm_andnot_si128(res, b)（其中 res = a+b）。
将是public class FeedMessageValue
{
// Three auto-implemented properties describing one feed message.
// FeedMessageData enumerates these properties by reflection and uses each
// property name as a dictionary key.
public string Username { get; set; }
public string SubscriptionID { get; set; }
public DateTime MessageTime { get; set; }
}
/// <summary>
/// Mail data for one feed message. The constructor captures the message
/// values and exposes them as a name-to-string map whose keys are the
/// property names of <c>FeedMessageValue</c>.
/// </summary>
public class FeedMessageData : IMailData
{
    private readonly FeedMessageValue feedMessageValue;

    // One dictionary owned by the instance. Creating a new dictionary inside
    // getMergeValues() would discard everything the constructor adds.
    private readonly Dictionary<string, string> mergeValues = new Dictionary<string, string>();

    /// <summary>Stores the three values and builds the merge map from them.</summary>
    /// <param name="username">Value stored under the key "Username".</param>
    /// <param name="subscriptionID">Value stored under the key "SubscriptionID".</param>
    /// <param name="messageTime">Value stored (via ToString()) under the key "MessageTime".</param>
    public FeedMessageData(string username, string subscriptionID, DateTime messageTime)
    {
        this.feedMessageValue = new FeedMessageValue
        {
            Username = username,
            SubscriptionID = subscriptionID,
            MessageTime = messageTime
        };

        foreach (PropertyInfo info in this.feedMessageValue.GetType().GetProperties())
        {
            object value = info.GetValue(this.feedMessageValue, null);
            // A null string property maps to "" instead of throwing on ToString().
            this.mergeValues.Add(info.Name, value == null ? string.Empty : value.ToString());
        }
    }

    /// <summary>Returns the merge map populated by the constructor.</summary>
    public Dictionary<string, string> getMergeValues()
    {
        return this.mergeValues;
    }
}
(使用public interface IMailData
{
// Returns a string-to-string map; IEmailGenerator.generateEmail takes an
// IMailData as its first argument.
// NOTE(review): presumably the keys are the $name$ placeholders used by
// EmailGenerator.mergeTemplate — confirm against generateEmail's body.
Dictionary<string, string> getMergeValues();
}
// Contract for producing a MailMessage from mail data plus two templates.
public interface IEmailGenerator
{
// Takes the mail data and an HTML and a text template, returns a MailMessage.
// NOTE(review): how the two templates are combined is not visible here —
// confirm against the implementing class.
MailMessage generateEmail(IMailData mailData, string htmlTemplate, string textTemplate);
}
// IEmailGenerator implementation; only the template-merging helper is shown
// in full in this excerpt.
public class EmailGenerator : IEmailGenerator, IRegisterInIoC
{
// Setup the rules
// Matches placeholders of the form $name$, where name is one or more word
// characters, '-', ',' or '.'; capture group 1 holds name without the '$'.
static readonly Regex emailRegex = new Regex(@"\$([\w\-\,\.]+)\$", RegexOptions.Compiled);
// Returns template with every $name$ placeholder replaced by values[name].
// NOTE(review): the indexer lookup has no fallback, so a placeholder with no
// matching key presumably throws KeyNotFoundException — confirm that is intended.
private string mergeTemplate(string template, IReadOnlyDictionary<string, string> values)
{
string emailTextData = emailRegex.Replace(template, match => values[match.Groups[1].Value]);
return emailTextData;
}
// NOTE(review): the body is elided in this excerpt; as written it contains
// no return statement.
public MailMessage generateEmail(IMailData mailData, string htmlTemplate, string textTemplate)
{
// MailMessage
}
}
)。
答案 1 :(得分:1)
这是一个可在 SSE2 上运行的版本，并针对 SSE4.1（_mm_blendv_ps）、AVX-512VL（_mm_ternarylogic_epi32）和 AVX-512DQ（_mm_movepi32_mask，根据 Peter Cordes 的建议）做了改进。
/*
 * Lane-wise saturating addition of four signed 32-bit integers.
 *
 * Baseline is SSE2. Faster instruction choices are selected at compile time
 * when SSE4.1, AVX-512VL or AVX-512DQ+VL are enabled; every path produces
 * the same result.
 */
__m128i __mm_adds_epi32( __m128i a, __m128i b) {
  /* Wrapping sum: already the right answer in every non-overflowing lane. */
  const __m128i wrapped = _mm_add_epi32(a, b);
  /* 1 in lanes where a is negative, 0 elsewhere. */
  const __m128i a_negative = _mm_srli_epi32(a, 31);
  const __m128i max_val = _mm_set1_epi32(INT32_MAX);

  /* Sign bit of `ovf` is set when a and b agree in sign but the sum does not. */
#if defined(__AVX512VL__)
  /* 0x42 is the truth table of ~(a ^ b) & (a ^ wrapped). */
  const __m128i ovf = _mm_ternarylogic_epi32(a, b, wrapped, 0x42);
#else
  const __m128i ovf =
    _mm_andnot_si128(_mm_xor_si128(a, b), _mm_xor_si128(a, wrapped));
#endif

#if defined(__AVX512DQ__) && defined(__AVX512VL__)
  /* Masked add: overflowing lanes get max_val + a_negative, others keep wrapped. */
  return _mm_mask_add_epi32(wrapped, _mm_movepi32_mask(ovf), max_val, a_negative);
#else
  /* INT32_MAX for non-negative a; INT32_MAX + 1 wraps to INT32_MIN for negative a. */
  const __m128i clamp = _mm_add_epi32(max_val, a_negative);
#  if defined(__SSE4_1__)
  /* blendvps takes the second operand where the mask's sign bit is set. */
  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(wrapped),
                                        _mm_castsi128_ps(clamp),
                                        _mm_castsi128_ps(ovf)));
#  else
  /* Broadcast the sign bit to a full-lane mask, then do a bitwise select. */
  const __m128i pick = _mm_srai_epi32(ovf, 31);
  return _mm_xor_si128(wrapped,
                       _mm_and_si128(pick, _mm_xor_si128(wrapped, clamp)));
#  endif
#endif
}
我是为 SIMDe 中 NEON vqaddq_s32（以及 MSA __msa_adds_s_b）的实现而写的；如果您需要其他版本，应该可以从 simde/arm/neon/qadd.h 改写得到。对于 128 位向量，除了 SSE 已支持的类型（8/16 位，有符号和无符号）之外，还有：
vqaddq_s32（相当于 _mm_adds_epi32）
vqaddq_s64（相当于 _mm_adds_epi64）
vqaddq_u32（相当于 _mm_adds_epu32）
vqaddq_u64（相当于 _mm_adds_epu64）也已实现，但目前依赖于向量扩展。我可以（也许也应该）把生成的代码移植成 intrinsics，但说实话我不确定怎样才能做得更好，所以就没有去做。
答案 2 :(得分:0)
此链接回答了这个问题:
https://software.intel.com/en-us/forums/topic/285219
以下是一个示例实现:
#include <stddef.h>
#include <stdint.h>

#include <immintrin.h>
/*
 * Saturating addition of four packed signed 32-bit integers (SSE4.1).
 * Lanes that overflow negatively become INT32_MIN, lanes that overflow
 * positively become INT32_MAX, all other lanes hold the ordinary sum.
 *
 * Declared `static inline` so that every translation unit has a usable
 * definition; a plain C99/C11 `inline` definition provides no external
 * symbol and fails to link when the call is not inlined.
 */
static inline __m128i __mm_adds_epi32( __m128i a, __m128i b )
{
    /* Ordinary locals: an intrinsic call is not a constant expression, so it
       cannot initialise a `static` object in C. */
    const __m128i int_min = _mm_set1_epi32( INT32_MIN );
    const __m128i int_max = _mm_set1_epi32( INT32_MAX );

    /* wrapping sum */
    const __m128i res = _mm_add_epi32( a, b );

    /* sign bit set where both inputs are negative / where either is negative */
    const __m128i sign_and = _mm_and_si128( a, b );
    const __m128i sign_or = _mm_or_si128( a, b );

    /* both negative but sum non-negative -> clamp to int_min */
    const __m128i min_sat_mask = _mm_andnot_si128( res, sign_and );
    /* both non-negative but sum negative -> clamp to int_max */
    const __m128i max_sat_mask = _mm_andnot_si128( sign_or, res );

    /* blendvps takes the second operand where the mask's sign bit is set */
    const __m128 res_temp = _mm_blendv_ps( _mm_castsi128_ps( res ),
                                           _mm_castsi128_ps( int_min ),
                                           _mm_castsi128_ps( min_sat_mask ) );
    return _mm_castps_si128( _mm_blendv_ps( res_temp,
                                            _mm_castsi128_ps( int_max ),
                                            _mm_castsi128_ps( max_sat_mask ) ) );
}
void addSaturate(int32_t* bufferA, int32_t* bufferB, size_t numSamples)
{
//
// Load and add
//
__m128i* pSrc1 = (__m128i*)bufferA;
__m128i* pSrc2 = (__m128i*)bufferB;
for(int i=0; i<numSamples/4; ++i)
{
__m128i res = __mm_adds_epi32(*pSrc1, *pSrc2);
_mm_store_si128(pSrc1, res);
pSrc1++;
pSrc2++;
}
}