我正在使用 gcc 4.8.2 编译下面的代码。如果用 g++ -mavx2 -O0 10bit.cpp 编译(不开启优化),
time 命令给出的运行时间如下:
real 0m0.117s
user 0m0.116s
sys 0m0.000s
但是当我用 g++ -mavx2 -O3 10bit.cpp 开启优化后,
time 命令显示的执行时间反而更长:
real 0m0.164s
user 0m0.164s
sys 0m0.000s
我的 CPU 型号为 Intel(R) Xeon(R) CPU E5-2667 v3 @ 3.20GHz,支持 AVX2。
此外,如果我改用 SSE4.1 指令而不是 AVX2,程序反而运行得更快。
有人可以解释一下吗?
#include <stdint.h>
#include "immintrin.h"
/* Scratch storage for one 256-bit vector, 32-byte aligned. main() stores a
 * whole vector here through __m_int_vals and then reads the eight 32-bit
 * lanes back individually as int_values[0..7]. */
uint32_t int_values[8] __attribute__ ((__aligned__(32)));
/* Input data: 8 rows of 20 bytes (160 bytes in total), 32-byte aligned.
 * main() consumes it in 40-byte steps, i.e. two rows per step. */
unsigned char buf[] __attribute__ ((__aligned__(32))) = {
0xFF, 0x9E, 0x8D, 0xCC, 0xBB, 0xAA, 0x99, 0x88, 0x77, 0x66, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
0xFF, 0x9E, 0x8D, 0xCC, 0xBB, 0xAA, 0x99, 0x88, 0x77, 0x66, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
0xFF, 0x9E, 0x8D, 0xCC, 0xBB, 0xAA, 0x99, 0x88, 0x77, 0x66, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
0xFF, 0x9E, 0x8D, 0xCC, 0xBB, 0xAA, 0x99, 0x88, 0x77, 0x66, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
0xFF, 0x9E, 0x8D, 0xCC, 0xBB, 0xAA, 0x99, 0x88, 0x77, 0x66, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
0xFF, 0x9E, 0x8D, 0xCC, 0xBB, 0xAA, 0x99, 0x88, 0x77, 0x66, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
0xFF, 0x9E, 0x8D, 0xCC, 0xBB, 0xAA, 0x99, 0x88, 0x77, 0x66, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
0xFF, 0x9E, 0x8D, 0xCC, 0xBB, 0xAA, 0x99, 0x88, 0x77, 0x66, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21
};
/* Output buffer, 32-byte aligned. main() writes 32 bytes per 40 input bytes,
 * so with the 160-byte buf above only out[0..127] is ever written. */
unsigned char out[180] __attribute__ ((__aligned__(32)));
/* Vector-typed view of int_values, used by main() for whole-vector loads and
 * stores.
 * NOTE(review): identifiers containing a double underscore are reserved for
 * the implementation; consider renaming if callers allow it. */
__m256i *__m_int_vals = (__m256i *)int_values;
/*
 * Benchmark driver: runs the buf -> out conversion 204800 times.
 *
 * Every pass walks buf in 40-byte steps and emits 32 bytes of out per step.
 * A 40-byte step is treated as eight 5-byte groups; for group g (0..7):
 *   - out[4g]   is byte 0 of the group, copied unchanged;
 *   - bytes 1..4 of the group form a big-endian 32-bit word W held in
 *     32-bit lane g of the vector;
 *   - out[4g+3] = bits 2..9 of W, out[4g+2] = bits 12..19 of W,
 *     out[4g+1] = bits 22..29 of W.
 * NOTE(review): this matches keeping the top 8 bits of four MSB-first packed
 * 10-bit samples per group -- presumably the intent; confirm.
 *
 * Side effects: overwrites out[] and int_values[] (through __m_int_vals).
 */
int main() {
    /* Bytes consumed from buf / produced into out per inner iteration. */
    enum { IN_STEP = 40, OUT_STEP = 32 };
    const int in_len = (int)sizeof(buf);
    const int out_len = (int)sizeof(out);

    for (int c = 0; c < 204800; c++) {
        /* Only complete steps are processed, so b[39] and out[j+31] always
         * stay inside their arrays. */
        for (int i = 0, j = 0;
             i + IN_STEP <= in_len && j + OUT_STEP <= out_len;
             i += IN_STEP, j += OUT_STEP) {
            const uint8_t *b = &buf[i];

            /* _mm256_set_epi8 takes its arguments from the most significant
             * byte down, so each row below is one 32-bit lane (lane 7 first)
             * with the group's byte 1 in the lane's top byte.
             * Lane 7 must use bytes 36..39: b[35] is byte 0 of group 7 and is
             * copied directly to out[j+28] below. */
            (*__m_int_vals) = _mm256_set_epi8(
                b[36], b[37], b[38], b[39],
                b[31], b[32], b[33], b[34],
                b[26], b[27], b[28], b[29],
                b[21], b[22], b[23], b[24],
                b[16], b[17], b[18], b[19],
                b[11], b[12], b[13], b[14],
                b[6],  b[7],  b[8],  b[9],
                b[1],  b[2],  b[3],  b[4]
            );

            /* First output byte of every group is the group's byte 0. */
            out[j]      = b[0];
            out[j + 4]  = b[5];
            out[j + 8]  = b[10];
            out[j + 12] = b[15];
            out[j + 16] = b[20];
            out[j + 20] = b[25];
            out[j + 24] = b[30];
            out[j + 28] = b[35];

            /* Each store below keeps only the low 8 bits of the lane
             * (uint32_t -> unsigned char conversion). */

            /* Total shift 2: low byte is bits 2..9 of the original word. */
            (*__m_int_vals) = _mm256_srli_epi32((*__m_int_vals), 2);
            out[j + 3]  = int_values[0];
            out[j + 7]  = int_values[1];
            out[j + 11] = int_values[2];
            out[j + 15] = int_values[3];
            out[j + 19] = int_values[4];
            out[j + 23] = int_values[5];
            out[j + 27] = int_values[6];
            out[j + 31] = int_values[7];

            /* Total shift 12: low byte is bits 12..19 of the original word. */
            (*__m_int_vals) = _mm256_srli_epi32((*__m_int_vals), 10);
            out[j + 2]  = int_values[0];
            out[j + 6]  = int_values[1];
            out[j + 10] = int_values[2];
            out[j + 14] = int_values[3];
            out[j + 18] = int_values[4];
            out[j + 22] = int_values[5];
            out[j + 26] = int_values[6];
            out[j + 30] = int_values[7];

            /* Total shift 22: low byte is bits 22..29 of the original word. */
            (*__m_int_vals) = _mm256_srli_epi32((*__m_int_vals), 10);
            out[j + 1]  = int_values[0];
            out[j + 5]  = int_values[1];
            out[j + 9]  = int_values[2];
            out[j + 13] = int_values[3];
            out[j + 17] = int_values[4];
            out[j + 21] = int_values[5];
            out[j + 25] = int_values[6];
            out[j + 29] = int_values[7];
        }
    }
}