我在 C++ 中有一个 short 到 float 的转换,它是我代码中的瓶颈。
这段代码对硬件设备缓冲区进行转换,该缓冲区的原生类型是 short,保存的是来自一台高级光子计数器的输入。
// Scalar conversion loop from the question: scales every 16-bit sample by 1/value.
// `value`, `source`, `destination`, `W` and `H` are declared outside this snippet.
float factor= 1.0f/value;
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
// Note: this local `value` shadows the outer `value` that was used to compute `factor`.
int value = source[i];//ushort -> int
destination[i] = value*factor;//int*float->float
}
一些细节
值应从0到2 ^ 16-1,它代表高灵敏度相机的像素值
我在配备i7处理器的多核x86机器上(i7 960是SSE 4.2和4.1)。
源缓冲区按 8 字节边界对齐(硬件设备的要求)
W * H总是可被8整除,大部分时间W和H可被8整除
这让我感到难过,有什么我可以做的吗?
我正在使用 Visual Studio 2012 ...
答案 0 :(得分:10)
这是一个基本的SSE4.1实现:
// Basic SSE4.1 version of the conversion loop: handles 8 samples per iteration.
// Uses aligned load/store intrinsics and steps by 8 with no tail handling.
// Broadcast the scale factor into all four float lanes.
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < W*H; i += 8)
{
// Load 8 16-bit ushorts.
// vi = {a,b,c,d,e,f,g,h}
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
// vi0 = {a,0,b,0,c,0,d,0}
// vi1 = {e,0,f,0,g,0,h,0}
// (shown as 16-bit lanes: each value is zero-extended to 32 bits;
//  _mm_unpackhi_epi64 moves the upper four values into the low half first)
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
这假定:
- source 和 destination 都按 16 字节对齐。
- W*H 是 8 的倍数。
通过进一步展开此循环可以做得更好。(见下文)
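这里的想法如下:
1. 将 8 个 short 加载到一个 SSE 寄存器中。
2. 将该寄存器拆分成两个:一个保存低 4 个 short,另一个保存高 4 个 short。
3. 将两个寄存器都零扩展为 32 位整数。
4. 将它们都转换为 float。
5. 乘以 factor。
6. 将结果存储到 destination。
编辑: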
我已经进行了一段时间的优化,所以我继续展开循环。
Core i7 920 @ 3.5 GHz
Visual Studio 2012 - Release x64:
Original Loop : 4.374 seconds
Vectorize no unroll: 1.665
Vectorize unroll 2 : 1.416
进一步展开导致收益递减。
这是测试代码:
#include <smmintrin.h>
#include <time.h>
#include <iostream>
#include <malloc.h>
using namespace std;
// Scalar reference conversion: destination[i] = sample(source[i]) / value for i in [0, size).
//
// The 16-bit samples are interpreted as UNSIGNED (0..65535). The buffer is typed
// `short`, but the vectorized variants in this file zero-extend with
// _mm_cvtepu16_epi32, so sign-extending here would make the reference disagree
// with them for any sample >= 32768.
//
// destination : output array with at least `size` floats
// source      : input array with at least `size` 16-bit samples
// value       : divisor; the loop multiplies by its reciprocal
// size        : number of samples to convert
void default_loop(float *destination,const short* source,float value,int size){
    const float factor = 1.0f / value;
    for (int i = 0; i < size; i++)
    {
        // Reinterpret the raw 16 bits as unsigned before widening to int.
        const int sample = static_cast<unsigned short>(source[i]);
        destination[i] = sample * factor;
    }
}
void vectorize8_unroll1(float *destination,const short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2(float *destination,const short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
// Prints the sum of the first `size` floats of `destination` on its own line.
// The benchmark prints this after each timed variant so the outputs can be compared.
void print_sum(const float *destination,int size){
    float total = 0;
    int idx = 0;
    while (idx < size){
        total += destination[idx];
        ++idx;
    }
    std::cout << total << std::endl;
}
// Benchmark driver: times the scalar loop and the two SSE variants over the same
// 8000-sample buffer, printing elapsed CPU seconds and an output sum for each.
int main(){
    const int size = 8000;
    const int iterations = 1000000;
    float value = 1.1;

    // Buffers are allocated with 16-byte alignment for the aligned SSE loads/stores.
    short *source = static_cast<short*>(_mm_malloc(size * sizeof(short), 16));
    float *destination = static_cast<float*>(_mm_malloc(size * sizeof(float), 16));

    // Input is a simple ramp 0, 1, 2, ...
    for (int idx = 0; idx < size; ++idx){
        source[idx] = idx;
    }

    clock_t start;
    double seconds;

    // Default Loop
    start = clock();
    for (int pass = 0; pass < iterations; ++pass){
        default_loop(destination,source,value,size);
    }
    seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << seconds << endl;
    print_sum(destination,size);

    // Vectorize 8, no unroll
    start = clock();
    for (int pass = 0; pass < iterations; ++pass){
        vectorize8_unroll1(destination,source,value,size);
    }
    seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << seconds << endl;
    print_sum(destination,size);

    // Vectorize 8, unroll 2
    start = clock();
    for (int pass = 0; pass < iterations; ++pass){
        vectorize8_unroll2(destination,source,value,size);
    }
    seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << seconds << endl;
    print_sum(destination,size);

    _mm_free(source);
    _mm_free(destination);
    system("pause");
}
答案 1 :(得分:9)
我相信我有最好的答案。我的结果比 Mysticial(答案 0 的作者,下文中被机器翻译成“神秘的”)的结果快得多。我的代码只需要 SSE2,但可以利用 SSE3、SSE4、AVX 甚至 AVX2(如果有的话)。您不必更改任何代码,只需要重新编译。
我跑了三个尺寸:8008,64000和2560 * 1920 = 4915200.我尝试了几种不同的版本。我列出了下面最重要的一些。函数vectorize8_unroll2
是神秘的功能。我做了一个名为vectorize8_unroll2_parallel
的改进版本。函数vec16_loop_unroll2_fix
和vec16_loop_unroll2_parallel_fix
是我的函数,我认为它比神秘的更好。如果使用AVX进行编译,这些函数将自动使用AVX,但在SSE4甚至SSE2上工作正常
另外,你写道“W * H总是可被8整除,大部分时间W和H可以被8整除”。
所以我们不能假设在所有情况下W * H都可被16整除。当大小不是16的倍数时,神秘的函数 vectorize8_unroll2
有一个错误(在他的代码中尝试大小= 8008,你会看到我的意思)。我的代码没有这样的错误。
我正在使用 Agner Fog 的 vectorclass 进行矢量化。它不是 lib 或 dll 文件,只是一些头文件。我使用 OpenMP 进行并行化。以下是一些结果:
Intel Xeon E5630 @2.53GHz (supports upto SSE4.2)
size 8008, size2 8032, iterations 1000000
default_loop time: 7.935 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.878 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.253 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 1.151 seconds, diff 0.000000
size 64000, size2 64000, iterations 100000
default_loop time: 6.387 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.195 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.439 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.432 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 5.125 seconds, diff 0.000000
vectorize8_unroll2 time: 3.496 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 3.490 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 3.119 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 3.127 seconds, diff 0.000000
编辑:我在本回答结尾处使用GCC在AVX系统上添加了结果。
以下是代码。代码看起来很长,因为我做了很多交叉检查并测试了很多变体。从 http://www.agner.org/optimize/#vectorclass 下载 vectorclass。将头文件(vectorclass.h,instrset.h,vectorf128.h,vectorf256.h,vectorf256e.h,vectori128.h,vectori256.h,vectori256e.h)复制到编译目录中。在 Visual Studio 的 C++/命令行 选项下添加 /D__SSE4_2__。在 Release 模式下编译。如果您有一个带 AVX 的 CPU,那么请改用 /arch:AVX。在 C++/语言 选项下启用 OpenMP 支持。
In GCC
SSE4.2: g++ foo.cpp -o foo_gcc -O3 -msse4.2 -fopenmp
AVX: g++ foo.cpp -o foo_gcc -O3 -mavx -fopenmp
在下面的代码中,函数vec16_loop_unroll2_parallel
要求数组为32的倍数。您可以将数组大小更改为32的倍数(即size2指的是)或者如果不可能,则可以使用没有这种限制的函数vec16_loop_unroll2_parallel_fix
。反正它一样快。
#include <stdio.h>
#include "vectorclass.h"
#include "omp.h"
// Rounds x down to a multiple of s by clearing the low bits of x.
// The mask ~((s)-1) only gives a multiple of s when s is a power of two.
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
// Portable aligned allocation: returns `size` bytes aligned to `align`, or 0 on failure.
// Uses _aligned_malloc with MSVC and posix_memalign elsewhere; release with aligned_free().
inline void* aligned_malloc(size_t size, size_t align) {
#ifndef _MSC_VER
    void *block;
    // posix_memalign reports failure through a non-zero return code.
    if (posix_memalign(&block, align, size) != 0) {
        return 0;
    }
    return block;
#else
    return _aligned_malloc(size, align);
#endif
}
// Releases memory obtained from aligned_malloc(), using the deallocator that
// matches the platform's allocator (_aligned_free with MSVC, free elsewhere).
inline void aligned_free(void *ptr) {
#ifndef _MSC_VER
    free(ptr);
#else
    _aligned_free(ptr);
#endif
}
// Scalar reference conversion: destination[i] = source[i] / value for i in [0, size).
// The division is done as a multiplication by the reciprocal of `value`.
void default_loop(float *destination, const unsigned short* source, float value, int size){
    const float scale = 1.0f/value;
    int idx = 0;
    while (idx < size) {
        // Widen to int first, exactly as the implicit ushort -> int -> float chain does.
        const int sample = source[idx];
        destination[idx] = sample*scale;
        ++idx;
    }
}
// Same conversion as default_loop, with the loop iterations distributed over
// threads by OpenMP (when compiled with OpenMP support).
void default_loop_parallel(float *destination, const unsigned short* source, float value, int size){
    const float scale = 1.0f / value;
    #pragma omp parallel for
    for (int idx = 0; idx < size; idx++) {
        // ushort -> int -> float, then scale.
        destination[idx] = static_cast<int>(source[idx])*scale;
    }
}
void vec8_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec8_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec16_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
void vec16_loop_unroll2_fix(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
int i = 0;
for (; i <ROUND_DOWN(size, 32); i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
for (; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vec16_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
// OpenMP-parallel conversion that is safe for any `size`:
// destination[i] = source[i] / value for i in [0, size).
//
// The parallel vector loop covers the largest multiple of 32 that fits in `size`
// (32 samples per iteration); the remaining (size % 32) samples are converted by
// a short scalar loop afterwards.
void vec16_loop_unroll2_parallel_fix(float *destination, const unsigned short* source, float value, int size) {
    const float factor = 1.0f/value;
    // Computed once and shared by both loops so they agree on where the tail starts.
    const int vector_end = ROUND_DOWN(size, 32);
    #pragma omp parallel for
    for (int i = 0; i < vector_end; i += 32) {
        // First group: samples i .. i+15.
        Vec16us vi = Vec16us().load(source + i);
        Vec8ui vi0 = extend_low(vi);
        Vec8ui vi1 = extend_high(vi);
        Vec8f vf0 = to_float(vi0);
        Vec8f vf1 = to_float(vi1);
        vf0*=factor;
        vf1*=factor;
        vf0.store(destination + i + 0);
        vf1.store(destination + i + 8);
        // Second group: samples i+16 .. i+31.
        Vec16us vi_new = Vec16us().load(source + i + 16);
        Vec8ui vi2 = extend_low(vi_new);
        Vec8ui vi3 = extend_high(vi_new);
        Vec8f vf2 = to_float(vi2);
        Vec8f vf3 = to_float(vi3);
        vf2*=factor;
        vf3*=factor;
        vf2.store(destination + i + 16);
        vf3.store(destination + i + 24);
    }
    // Scalar tail.
    for (int i = vector_end; i < size; i++) {
        const int sample = source[i];
        destination[i] = sample*factor;
    }
}
void vectorize8_unroll1(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void vectorize8_unroll1_parallel(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
#pragma omp parallel for
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2_parallel(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
#pragma omp parallel for
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
// Copies the first `size` floats from `a` into `b`.
// Note the argument order: the source comes first and the destination second.
void copy_arrays(float* a, float*b, const int size) {
    for(int i=0; i<size; i++) {
        b[i] = a[i];
    }
}
// Compares the first `size` elements of `a` and `b`.
//
// Returns 0 when every a[i] - b[i] is zero. At the first non-zero difference it
// prints the index and both values and returns that difference, so a mismatch is
// visible in the caller's "diff" column.
float compare_arrays(float* a, float*b, const int size) {
    for(int i=0; i<size; i++) {
        const float diff = a[i] - b[i];
        if(diff!=0) {
            printf("i %d, a[i] %f, b[i] %f, diff %f\n", i, a[i], b[i], diff);
            return diff;
        }
    }
    return 0;
}
// Fills a[0..size) with pseudo-random 16-bit values derived from rand().
void randomize_array(unsigned short* a, const int size) {
    int idx = 0;
    while (idx < size) {
        // Map rand() onto the unit interval, then scale up to the 16-bit range.
        const float unit = static_cast<float>(rand())/RAND_MAX;
        a[idx] = static_cast<int>(65536*unit);
        ++idx;
    }
}
void run(int size, int iterations) {
int rd = ROUND_DOWN(size, 32);
int size2 = rd == size ? size : rd + 32;
float value = 1.1f;
printf("size %d, size2 %d, iterations %d\n", size, size2, iterations);
unsigned short* source = (unsigned short*)aligned_malloc(size2*sizeof(short), 16);
float* destination = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_old = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_ref = (float*)aligned_malloc(size2*sizeof(float), 16);
void (*fp[16])(float *destination, const unsigned short* source, float value, int size);
fp[0] = default_loop;
fp[1] = vec8_loop;
fp[2] = vec8_loop_unroll2;
fp[3] = vec16_loop;
fp[4] = vec16_loop_unroll2;
fp[5] = vec16_loop_unroll2_fix;
fp[6] = vectorize8_unroll1;
fp[7] = vectorize8_unroll2;
fp[8] = default_loop_parallel;
fp[9] = vec8_loop_parallel;
fp[10] = vec8_loop_unroll2_parallel;
fp[11] = vec16_loop_parallel;
fp[12] = vec16_loop_unroll2_parallel;
fp[13] = vec16_loop_unroll2_parallel_fix;
fp[14] = vectorize8_unroll1_parallel;
fp[15] = vectorize8_unroll2_parallel;
char* func_str[] = {"default_loop", "vec8_loop", "vec8_loop_unrool2", "vec16_loop", "vec16_loop_unroll2", "vec16_loop_unroll2_fix", "vectorize8_unroll1", "vectorize8_unroll2",
"default_loop_parallel", "vec8_loop_parallel", "vec8_loop_unroll2_parallel","vec16_loop_parallel", "vec16_loop_unroll2_parallel", "vec16_loop_unroll2_parallel_fix",
"vectorize8_unroll1_parallel", "vectorize8_unroll2_parallel"};
randomize_array(source, size2);
copy_arrays(destination_old, destination_ref, size);
fp[0](destination_ref, source, value, size);
for(int i=0; i<16; i++) {
copy_arrays(destination_old, destination, size);
double dtime = omp_get_wtime();
for (int it = 0; it < iterations; it++){
fp[i](destination, source, value, size);
}
dtime = omp_get_wtime() - dtime;
float diff = compare_arrays(destination, destination_ref, size);
printf("%40s time: %.3f seconds, diff %f\n", func_str[i], dtime, diff);
}
printf("\n");
aligned_free(source);
aligned_free(destination);
aligned_free(destination_old);
aligned_free(destination_ref);
}
// Runs the benchmark for three input sizes; the iteration count shrinks as the
// input grows.
int main() {
    // {size, iterations}
    static const int configs[3][2] = {
        {8008, 1000000},
        {64000, 100000},
        {2560*1920, 1000},
    };
    for (int c = 0; c < 3; ++c) {
        run(configs[c][0], configs[c][1]);
    }
}
以下是在具有 AVX 的系统上使用 GCC 得到的结果。GCC 会自动矢量化该循环(Visual Studio 因为 short 类型而失败,但如果换成 int 就可以)。手写矢量化代码获得的收益很少。不过,是否使用多线程有帮助取决于数组大小。对于 8008 这样的小数组,OpenMP 的结果更差;对于较大的数组(128000),使用 OpenMP 的结果好得多;对于最大的数组(4915200),它完全受内存带宽限制,OpenMP 没有帮助。
i7-2600k @ 4.4GHz
size 8008, size2 8032, iterations 1000000
default_loop time: 1.319 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.167 seconds, diff 0.000000
vectorize8_unroll2 time: 1.227 seconds, diff 0.000000
vec16_loop_unroll2_parallel time: 1.528 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.381 seconds, diff 0.000000
size 128000, size2 128000, iterations 100000
default_loop time: 2.902 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.838 seconds, diff 0.000000
vectorize8_unroll2 time: 2.844 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.706 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.672 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 2.313 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.309 seconds, diff 0.000000
vectorize8_unroll2 time: 2.318 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 2.353 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 2.349 seconds, diff 0.000000
答案 2 :(得分:5)
在我的机器上使用SSE内在函数[Quad Core Athlon,3.3GHz,16GB RAM]和g++ -O2
优化[1]可以提供大约2.5-3倍的速度。我还编写了一个函数来在内联汇编程序中执行相同的操作,但它并没有明显更快(再次,这适用于我的机器,随意在其他机器上运行)。
我尝试了各种尺寸的H * W,它们都给出了大致相同的结果。
[1]使用g++ -O3
为所有四个函数提供相同的时间,因为显然-O3
启用“自动矢量化代码”。因此,假设您的编译器支持类似的自动矢量化功能,那么整个事情就有点浪费时间。
<strong>结果</strong>
convert_naive sum=4373.98 t=7034751 t/n=7.03475
convert_naive sum=4373.98 t=7266738 t/n=7.26674
convert_naive sum=4373.98 t=7006154 t/n=7.00615
convert_naive sum=4373.98 t=6815329 t/n=6.81533
convert_naive sum=4373.98 t=6820318 t/n=6.82032
convert_unroll4 sum=4373.98 t=8103193 t/n=8.10319
convert_unroll4 sum=4373.98 t=7276156 t/n=7.27616
convert_unroll4 sum=4373.98 t=7028181 t/n=7.02818
convert_unroll4 sum=4373.98 t=7074258 t/n=7.07426
convert_unroll4 sum=4373.98 t=7081518 t/n=7.08152
convert_sse_intrinsic sum=4373.98 t=3377290 t/n=3.37729
convert_sse_intrinsic sum=4373.98 t=3227018 t/n=3.22702
convert_sse_intrinsic sum=4373.98 t=3007898 t/n=3.0079
convert_sse_intrinsic sum=4373.98 t=3253366 t/n=3.25337
convert_sse_intrinsic sum=4373.98 t=5576068 t/n=5.57607
convert_sse_inlineasm sum=4373.98 t=3470887 t/n=3.47089
convert_sse_inlineasm sum=4373.98 t=2838492 t/n=2.83849
convert_sse_inlineasm sum=4373.98 t=2828556 t/n=2.82856
convert_sse_inlineasm sum=4373.98 t=2789052 t/n=2.78905
convert_sse_inlineasm sum=4373.98 t=3176522 t/n=3.17652
<strong>代码</strong>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cstring>
#include <xmmintrin.h>
#include <emmintrin.h>
// Benchmark image dimensions; every converter below processes W*H samples.
#define W 1000
#define H 1000
// Returns the 64-bit value produced by the x86 `rdtsc` instruction, which
// delivers its result as two 32-bit halves in EAX (low) and EDX (high).
static __inline__ unsigned long long rdtsc(void)
{
    unsigned low_bits, high_bits;
    __asm__ __volatile__ ("rdtsc" : "=a"(low_bits), "=d"(high_bits));
    const unsigned long long upper = static_cast<unsigned long long>(high_bits) << 32;
    return upper | static_cast<unsigned long long>(low_bits);
}
// Scalar baseline: destination[i] = source[i] / 32767 for all W*H samples.
// Samples are signed here (short widened to int).
void convert_naive(short *source, float *destination)
{
    const float scale = 1.0f/32767;
    const int count = W*H;
    for (int idx = 0; idx < count; ++idx)
    {
        const int sample = source[idx];
        destination[idx] = sample*scale;
    }
}
// Scalar conversion manually unrolled by four: each iteration reads four
// samples and then writes four scaled results. Steps by 4 with no tail handling.
void convert_unroll4(short *source, float *destination)
{
    const float scale = 1.0f/32767;
    for (int base = 0; base < W*H; base += 4)
    {
        const short *in = source + base;
        float *out = destination + base;
        // Read all four inputs before writing any output, as the unrolled form does.
        const int s0 = in[0];
        const int s1 = in[1];
        const int s2 = in[2];
        const int s3 = in[3];
        out[0] = s0*scale;
        out[1] = s1*scale;
        out[2] = s2*scale;
        out[3] = s3*scale;
    }
}
// SSE2 conversion of W*H signed 16-bit samples: destination[i] = source[i] / 32767,
// four samples per iteration.
//
// Only 128-bit SSE2 intrinsics are used (no __m64/MMX values, so no _mm_empty()
// is needed afterwards), and vectors are built with _mm_set1_ps /
// _mm_loadl_epi64 rather than compiler-specific brace initialisation.
//
// `destination` must be 16-byte aligned because an aligned store is used; the
// 64-bit source load has no alignment requirement. W*H must be a multiple of 4.
void convert_sse_intrinsic(short *source, float *destination)
{
    const __m128 factor = _mm_set1_ps(1.0f/32767);
    for (int i = 0; i < W*H; i+=4)
    {
        // Load 4 shorts (64 bits) into the low half of the register; the upper half is zeroed.
        const __m128i raw = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(source + i));
        // Interleave each sample with itself so it sits in the top 16 bits of a
        // 32-bit lane, then shift right arithmetically to sign-extend it.
        const __m128i value = _mm_srai_epi32(_mm_unpacklo_epi16(raw, raw), 16);
        const __m128 fval = _mm_cvtepi32_ps(value);
        // destination[i..i+3] = value[0..3] * factor
        _mm_store_ps(destination + i, _mm_mul_ps(fval, factor));
    }
}
// Hand-written x86-64 SSE2 conversion of W*H signed 16-bit samples:
// destination[i] = source[i] / 32767, 16 samples (32 source bytes) per iteration.
//
// rax is the byte offset into `source`; the matching float offset is 2*rax.
// The loop is bottom-tested and continues only while rax is strictly below the
// total byte count, so exactly W*H samples are processed. W*H must be a
// non-zero multiple of 16 and `destination` must be 16-byte aligned (movaps).
//
// The clobber list names every register the asm writes, plus "memory" (it
// stores to `destination`) and "cc" (cmpq changes the flags).
void convert_sse_inlineasm(short *source, float *destination)
{
    __m128 factor = { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
    __asm__ __volatile__(
        "\t pxor %%xmm1, %%xmm1\n"          // xmm1 = 0, used to widen 16 -> 32 bits
        "\t movaps %3, %%xmm2\n"            // xmm2 = factor
        "\t mov $0, %%rax\n"                // rax = byte offset into source
        "1:"
        "\t movq (%1, %%rax), %%xmm0\n"     // load 4 x 4 shorts
        "\t movq 8(%1, %%rax), %%xmm3\n"
        "\t movq 16(%1, %%rax), %%xmm4\n"
        "\t movq 24(%1, %%rax), %%xmm5\n"
        "\t punpcklwd %%xmm1, %%xmm0\n"     // widen to 32-bit lanes
        "\t pslld $16, %%xmm0\n"            // shift up/down to sign-extend
        "\t psrad $16, %%xmm0\n"
        "\t cvtdq2ps %%xmm0, %%xmm0\n"      // int32 -> float
        "\t mulps %%xmm2, %%xmm0\n"         // scale
        "\t punpcklwd %%xmm1, %%xmm3\n"
        "\t pslld $16, %%xmm3\n"
        "\t psrad $16, %%xmm3\n"
        "\t cvtdq2ps %%xmm3, %%xmm3\n"
        "\t mulps %%xmm2, %%xmm3\n"
        "\t punpcklwd %%xmm1, %%xmm4\n"
        "\t pslld $16, %%xmm4\n"
        "\t psrad $16, %%xmm4\n"
        "\t cvtdq2ps %%xmm4, %%xmm4\n"
        "\t mulps %%xmm2, %%xmm4\n"
        "\t punpcklwd %%xmm1, %%xmm5\n"
        "\t pslld $16, %%xmm5\n"
        "\t psrad $16, %%xmm5\n"
        "\t cvtdq2ps %%xmm5, %%xmm5\n"
        "\t mulps %%xmm2, %%xmm5\n"
        "\t movaps %%xmm0, (%0, %%rax, 2)\n"    // store 16 floats
        "\t movaps %%xmm3, 16(%0, %%rax, 2)\n"
        "\t movaps %%xmm4, 32(%0, %%rax, 2)\n"
        "\t movaps %%xmm5, 48(%0, %%rax, 2)\n"
        "\t addq $32, %%rax\n"
        "\t cmpq %2, %%rax\n"
        "\t jb 1b\n"                            // loop while rax < total bytes
        : /* no outputs */
        : "r" (destination), "r" (source), "i"(sizeof(*source) * H * W), "m"(factor):
        "rax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory", "cc");
}
// Benchmark input samples, 16-byte aligned.
short inbuffer[W * H] __attribute__ ((aligned (16)));
// Benchmark output, 16-byte aligned, with 16 extra floats of slack beyond W*H.
float outbuffer[W * H + 16] __attribute__ ((aligned (16)));
#ifdef DEBUG
// Reference output filled by convert_naive when result checking is enabled.
float outbuffer2[W * H];
#endif
// Common signature of all converters under test.
typedef void (*func)(short *source, float *destination);
// One benchmark table entry: a converter and the name printed next to its timing.
struct BmEntry
{
// Name shown in the output.
const char *name;
// Converter to run.
func fn;
};
// Runs one converter over inbuffer -> outbuffer, timing it with rdtsc, then
// prints the entry name, the sum of the W*H outputs, the elapsed tick count and
// ticks per sample. With DEBUG enabled, the output is also checked element by
// element against convert_naive and every mismatch is printed.
void bm(BmEntry& e)
{
    const int count = W * H;

    // Clear the output so results left by a previous run cannot leak into the sum.
    memset(outbuffer, 0, sizeof(outbuffer));

    const unsigned long long begin = rdtsc();
    e.fn(inbuffer, outbuffer);
    const unsigned long long t = rdtsc() - begin;

    float sum = 0;
    for (int idx = 0; idx < count; ++idx)
    {
        sum += outbuffer[idx];
    }
#if DEBUG
    convert_naive(inbuffer, outbuffer2);
    for (int idx = 0; idx < count; ++idx)
    {
        if (outbuffer[idx] != outbuffer2[idx])
        {
            std::cout << idx << ":: " << inbuffer[idx] << ": "
                      << outbuffer[idx] << " != " << outbuffer2[idx]
                      << std::endl;
        }
    }
#endif
    const double ticks_per_sample = static_cast<double>(t) / count;
    std::cout << std::left << std::setw(30) << e.name
              << " sum=" << sum
              << " t=" << t
              << " t/n=" << ticks_per_sample
              << std::endl;
}
// BM(x) expands to a BmEntry initializer: the function's name as a string
// literal, followed by the function itself.
#define BM(x) { #x, x }
// Converters to benchmark, in the order main() runs them.
BmEntry table[] =
{
BM(convert_naive),
BM(convert_unroll4),
BM(convert_sse_intrinsic),
BM(convert_sse_inlineasm),
};
// Fills the input buffer with a ramp (truncated to short) and benchmarks every
// table entry five times in a row.
int main()
{
    const int count = W * H;
    for (int idx = 0; idx < count; ++idx)
    {
        inbuffer[idx] = static_cast<short>(idx);
    }

    const size_t entries = sizeof(table)/sizeof(table[0]);
    for (size_t entry = 0; entry < entries; ++entry)
    {
        for (int rep = 0; rep < 5; ++rep)
        {
            bm(table[entry]);
        }
    }
    return 0;
}
答案 3 :(得分:2)
不确定循环中的条件表达式是否仅计算一次。 你可以尝试:
// Variant of the question's loop with the bound W*H computed once into `count`
// in the loop initializer, so the condition compares against a plain local.
float factor= 1.0f/value;
for (int i = 0, count = W*H; i < count; ++i)//25% of time is spent doing this
{
// Note: this local `value` shadows the outer `value` used to compute `factor`.
int value = source[i];//short -> int
destination[i] = value*factor;//int->float
}
答案 4 :(得分:2)
这不是一个有效的答案,不要把它当成它,但我实际上想知道代码如何通过使用256k查找表来表现。 (基本上是一个'短到浮动'表,包含65536个条目)。
我认为CoreI7有大约8兆字节的缓存,因此查找表适合数据缓存。
我真的很想知道这会如何影响表现:)
答案 5 :(得分:2)
并且您可以使用OpenMP来雇用CPU的每个核心,只需执行以下操作即可:
#include <omp.h>
// The question's loop with an OpenMP work-sharing pragma: iterations are
// independent (each writes only destination[i]), so they can be split across threads.
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
// Note: this local `value` shadows the outer `value` used to compute `factor`.
int value = source[i];//ushort -> int
destination[i] = value*factor;//int*float->float
}
这是基于以前程序的结果,只需添加如下:
// NOTE(review): here the pragma is placed on the benchmark's repetition loop, not
// on the conversion loop, so whole conversions run concurrently. Presumably all
// of them write the same destination buffer as in the earlier test program —
// confirm that this is what the timings below are meant to measure.
#pragma omp parallel for
for (int it = 0; it < iterations; it++){
...
}
然后是结果
beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -fopenmp
beta@beta-PC ~
$ opt
0.748
2.90873e+007
0.484
2.90873e+007
0.796
2.90873e+007
beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -O3
beta@beta-PC ~
$ opt
1.404
2.90873e+007
1.404
2.90873e+007
1.404
2.90873e+007
。
结果显示openmp提高了100%。 Visual C ++也支持openmp。
答案 6 :(得分:1)
您可以尝试近似表达式
// Floating-point scale factor that the integer fraction below is meant to approximate.
float factor = 1.0f/value;
用分数 numerator/denominator 来近似,其中 numerator 和 denominator 都是 int。这可以按照您的应用所需的精度来完成,例如
// Fixed-point approximation of `factor`: factor ~= numerator / denominator.
// A larger denominator gives more precision.
int denominator = 10000;
// The float product is truncated when converted to int.
int numerator = factor * denominator;
然后你可以在整数算术中进行计算,比如
int value = source[i];
// Scale by the integer fraction numerator/denominator. The divisor must be
// `denominator`; dividing by `numerator` would just cancel the multiplication.
destination[i] = (value * numerator) / denominator;
您必须处理溢出,或许需要在 64 位系统上改用 long(甚至 long long)来进行计算。