Question

假设您的任务在每个浮点子寄存器中都留下了一个小计。我没有找到能把这些小计合并成一个浮点总和的指令。我是否需要先把MM寄存器存储到普通内存中，然后再用简单的指令求和？

（这些值是双精度还是单精度尚未确定；如果我能找到相应的操作码，我计划为每一种CPU变体编写代码，直到即将推出的（？）512位AVX版本。）

Answer 1

wget http://www.agner.org/optimize/vectorclass.zip
unzip vectorclass.zip -d vectorclass
cd vectorclass/

此代码为GPLv3。

SSE

grep -A11 horizontal_add vectorf128.h

// Horizontal add: returns the sum of the four float elements of a.
// NOTE(review): this listing is grep output limited to 11 trailing lines,
// so the function's closing brace is not shown here.
static inline float horizontal_add (Vec4f const & a) {
#if  INSTRSET >= 3  // SSE3
    // Two rounds of pairwise horizontal adds leave the total in element 0.
    __m128 t1 = _mm_hadd_ps(a,a);
    __m128 t2 = _mm_hadd_ps(t1,t1);
    return _mm_cvtss_f32(t2);        
#else
    // Fallback for instruction sets below SSE3.
    // Copy the upper two elements of a into the lower two positions.
    __m128 t1 = _mm_movehl_ps(a,a);
    // Elements 0 and 1 now hold a0+a2 and a1+a3.
    __m128 t2 = _mm_add_ps(a,t1);
    // Bring element 1 of t2 into element 0.
    __m128 t3 = _mm_shuffle_ps(t2,t2,1);
    // Add the two partial sums in element 0 only.
    __m128 t4 = _mm_add_ss(t2,t3);
    return _mm_cvtss_f32(t4);
#endif
--
// Horizontal add: returns the sum of the two double elements of a.
static inline double horizontal_add (Vec2d const & a) {
#if  INSTRSET >= 3  // SSE3
    // A single horizontal add leaves a0+a1 in the low element.
    return _mm_cvtsd_f64(_mm_hadd_pd(a,a));
#else
    // Fallback for instruction sets below SSE3: view the vector as floats
    // so the upper 64 bits can be moved down with movehl, then add the
    // low elements only.
    __m128  as_ps = _mm_castpd_ps(a);
    __m128d upper = _mm_castps_pd(_mm_movehl_ps(as_ps,as_ps));
    return _mm_cvtsd_f64(_mm_add_sd(a,upper));
#endif
}

AVX

grep -A6 horizontal_add vectorf256.h

// Horizontal add: returns the sum of the eight float elements of a.
static inline float horizontal_add (Vec8f const & a) {
    // The 256-bit horizontal add works within each 128-bit half, so after
    // two rounds element 0 of each half holds that half's sum.
    __m256 pair_sums = _mm256_hadd_ps(a,a);
    __m256 half_sums = _mm256_hadd_ps(pair_sums,pair_sums);
    // Combine the upper half's sum with the lower half's sum.
    __m128 upper = _mm256_extractf128_ps(half_sums,1);
    __m128 lower = _mm256_castps256_ps128(half_sums);
    return _mm_cvtss_f32(_mm_add_ss(lower,upper));
}
--
// Horizontal add: returns the sum of the four double elements of a.
static inline double horizontal_add (Vec4d const & a) {
    // The 256-bit horizontal add works within each 128-bit half, so the
    // low element of each half holds that half's sum.
    __m256d half_sums = _mm256_hadd_pd(a,a);
    // Combine the upper half's sum with the lower half's sum.
    __m128d upper = _mm256_extractf128_pd(half_sums,1);
    __m128d lower = _mm256_castpd256_pd128(half_sums);
    return _mm_cvtsd_f64(_mm_add_sd(lower,upper));
}

AVX512

grep -A3 horizontal_add vectorf512.h

// Horizontal add: returns the sum of all float elements of a.
static inline float horizontal_add (Vec16f const & a) {
#if !defined(__INTEL_COMPILER)
    // Add the low and high halves elementwise, then reduce the result
    // with the horizontal_add overload for the half-width vector.
    return horizontal_add(a.get_low() + a.get_high());
#else
    // The Intel compiler provides a reduction intrinsic for this.
    return _mm512_reduce_add_ps(a);
#endif
}

--
// Horizontal add: returns the sum of all double elements of a.
static inline double horizontal_add (Vec8d const & a) {
#if !defined(__INTEL_COMPILER)
    // Add the low and high halves elementwise, then reduce the result
    // with the horizontal_add overload for the half-width vector.
    return horizontal_add(a.get_low() + a.get_high());
#else
    // The Intel compiler provides a reduction intrinsic for this.
    return _mm512_reduce_add_pd(a);
#endif
}

get_high()和get_low()

// Returns the upper 256 bits of zmm as a Vec8f.
Vec8f get_high() const {
    // The extraction is done with the double-precision 64x4 intrinsic,
    // so the register is cast to pd and the result is cast back to ps.
    __m512d as_pd = _mm512_castps_pd(zmm);
    __m256d upper = _mm512_extractf64x4_pd(as_pd,1);
    return _mm256_castpd_ps(upper);
}
// Returns the lower 256 bits of zmm as a Vec8f.
Vec8f get_low() const {
    __m256 lower = _mm512_castps512_ps256(zmm);
    return lower;
}

// Returns the lower 256 bits of zmm as a Vec4d.
Vec4d get_low() const {
    __m256d lower = _mm512_castpd512_pd256(zmm);
    return lower;
}

// Returns the upper 256 bits of zmm as a Vec4d.
Vec4d get_high() const {
    __m256d upper = _mm512_extractf64x4_pd(zmm,1);
    return upper;
}

对于整数，在vectori128.h，vectori256.h和vectori512.h中查找horizontal_add。

您也可以直接使用Vector Class Library（VCL）

#include <stdio.h>
#define MAX_VECTOR_SIZE 512
#include "vectorclass.h"

// Demo: sums the first 4, 8 and 16 values of 1..16 with horizontal_add and
// prints each result next to the closed-form value k*(k+1)/2.
int main(void) {
  // Fill the input with 1, 2, ..., 16.
  float x[16];
  for (int i = 0; i < 16; i++) {
    x[i] = i + 1;
  }

  // Load the start of x into vectors of three different widths.
  Vec4f  v4  = Vec4f().load(x);
  Vec8f  v8  = Vec8f().load(x);
  Vec16f v16 = Vec16f().load(x);

  const float sum4  = horizontal_add(v4);
  const float sum8  = horizontal_add(v8);
  const float sum16 = horizontal_add(v16);

  printf("%f %d\n", sum4, 4*5/2);
  printf("%f %d\n", sum8, 8*9/2);
  printf("%f %d\n", sum16, 16*17/2);
  return 0;
}

像这样编译（我的KNL机器上的GCC版本太旧，不支持AVX512）

SSE2:     g++  -O3 test.cpp
AVX:      g++  -O3 -mavx test.cpp
AVX512ER: icpc -O3 -xMIC-AVX512 test.cpp

输出

10.000000 10
36.000000 36
136.000000 136

VCL库的一个好处是：如果您在只支持SSE2的系统上使用例如Vec8f，它会用两次SSE操作来模拟AVX。

请参见vectorclass.pdf手册中的“指令集和CPU调度”一节，了解如何使用MSVC、ICC、Clang和GCC针对不同的指令集进行编译。

Answer 2

我为AVX2实现了以下内联函数。它汇总所有元素并返回结果。您可以将此视为一个建议答案，以便为此目的开发自己的功能。

注意：_mm256_extract_epi32在仅支持AVX的环境下不可用；对于浮点数，您可以改用基于vmovss的float _mm256_cvtss_f32 (__m256 a)来取出最低位元素，并据此开发适用于浮点数的水平加法函数。

// Horizontal addition of epi32: returns the sum of the eight 32-bit
// integer elements of a. Uses AVX2 integer intrinsics.
inline int _mm256_hadd2_epi32(__m256i a)
{
    // With both sources equal to a, imm8 = 1 yields a with its two 128-bit
    // halves swapped (low half <- a's high half, high half <- a's low half).
    const __m256i swapped = _mm256_permute2x128_si256(a, a, 1);
    // hadd works within each 128-bit half; after this step the low half
    // holds the four adjacent-pair sums of all eight elements.
    const __m256i pair_sums = _mm256_hadd_epi32(a, swapped);
    // Two more rounds reduce those four pair sums to the total in element 0.
    const __m256i quad_sums = _mm256_hadd_epi32(pair_sums, pair_sums);
    const __m256i total = _mm256_hadd_epi32(quad_sums, quad_sums);
    return _mm256_extract_epi32(total, 0);
}

如何对SSE XMM，AVX YMM和ZMM寄存器中的所有32位或64位子寄存器求和？

2 个答案: