以下代码片段是我的应用程序中的性能瓶颈:
double theta = acos(d);
double a = cos( theta*one_third );
double b = cos( theta*one_third + M_PI_23 );
double c = cos( theta*one_third + M_PI_43 );
其中 one_third=1.0/3.0,M_PI_23=M_PI*2.0/3.0,M_PI_43=M_PI*4.0/3.0。这段代码位于CUDA代码中,不过在x86上也存在同样的问题。
有没有人知道对上述代码的合理简化方法,使我可以避免 acos 调用和/或随后的 cos 调用?它们合计占了90%的计算时间,而单次 acos 调用的开销与三次 cos 调用相当。
THX
答案 0 :(得分:3)
三角恒等式:cos(A + B) = cosA·cosB − sinA·sinB
所以改变这个:
double a = cos( theta*one_third );
改为:
double as, ac;
sincos( theta*one_third, &as, &ac );
然后可以按如下方式计算 a、b 和 c:
double a = ac;
double b = ac*cos(M_PI_23) - as*sin(M_PI_23);
double c = ac*cos(M_PI_43) - as*sin(M_PI_43);
当然,如果可能,应当用编译时常量替换 cos(M_PI_xx) 和 sin(M_PI_xx)。编译器也许能自行完成这一优化,但也可能不会。
下面是一个完整的可运行示例,表明在以这类计算为主的代码中,速度提升约30%(约为原来的1.3倍):
$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Factor applied to theta before each cosine evaluation in the kernel.
#define one_third (1.0/3.0)
// Fallback definition of M_PI, left disabled; M_PI is presumably supplied by
// <math.h> on this toolchain -- TODO confirm when porting.
// #define M_PI 3.141592654
// Phase offsets 2*pi/3 and 4*pi/3 added to theta/3 for the second and third
// cosine terms.
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
// Number of input elements processed by each kernel launch.
#define DSIZE 65536
// Threads per block used in the kernel launch configuration.
#define nTPB 256
// Number of kernel launches in the timing loop; the reported time is divided
// by this count.
#define NL 100
// cudaCheckErrors(msg): reads (and clears) the CUDA runtime's last recorded
// error via cudaGetLastError(). If it is anything other than cudaSuccess, the
// macro prints `msg`, the runtime's error string and the source location to
// stderr, then terminates the process with exit status 1.
//
// The local variable uses a macro-prefixed name without a double underscore:
// identifiers containing "__" are reserved for the implementation in C++.
// The do { ... } while (0) wrapper lets the macro be used as one statement.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cudaCheckErrors_err_ = cudaGetLastError(); \
        if (cudaCheckErrors_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(cudaCheckErrors_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
// Returns the current wall-clock time, expressed in microseconds, minus
// `start`. Calling it with 0 gives a raw timestamp; calling it again with that
// timestamp gives the microseconds elapsed since the first call.
long long dtime_usec(unsigned long long start){
    timeval now;
    gettimeofday(&now, 0);
    // Seconds are scaled to microseconds, then the sub-second part is added.
    const unsigned long long stamp_us = (now.tv_sec*USECPSEC) + now.tv_usec;
    return stamp_us - start;
}
// Kernel tk: for each i in [0, dsize), reads x = d[i], takes theta = acos(x),
// and writes three cosines into the following planes of the same buffer:
//   d[i +   dsize] = cos(theta/3)
//   d[i + 2*dsize] = cos(theta/3 + 2*pi/3)
//   d[i + 3*dsize] = cos(theta/3 + 4*pi/3)
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used), one
// thread per element; threads whose index is >= dsize do nothing.
// Precondition: d must be large enough for indices up to 4*dsize - 1.
//
// Without USE_I, the three values are computed with three cos() calls and the
// smp*/cmp* arguments are unused. With USE_I, one sincos() call is combined
// with the angle-addition identity, using smp23/cmp23 and smp43/cmp43 as the
// sine/cosine of the two phase offsets (main passes sin/cos of M_PI_23 and
// M_PI_43).
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){
    const int gid = blockDim.x*blockIdx.x + threadIdx.x;
    if (gid >= dsize) return;  // surplus threads in the last block

    const double theta = acos(d[gid]);
    double r0, r1, r2;
#ifdef USE_I
    // cos(t + p) = cos(t)*cos(p) - sin(t)*sin(p), with t = theta/3.
    double sinT, cosT;
    sincos(theta*one_third, &sinT, &cosT);
    r0 = cosT;
    r1 = cosT*cmp23 - sinT*smp23;
    r2 = cosT*cmp43 - sinT*smp43;
#else
    // Reference path: evaluate each cosine directly.
    r0 = cos( theta*one_third );
    r1 = cos( theta*one_third + M_PI_23 );
    r2 = cos( theta*one_third + M_PI_43 );
#endif
    d[gid+dsize]   = r0;
    d[gid+2*dsize] = r1;
    d[gid+3*dsize] = r2;
}
// Benchmark driver. Fills DSIZE inputs with random values in [0, 1], copies
// them to the device, launches kernel tk NL times, and prints the average
// wall-clock time per launch in seconds.
//
// Every CUDA runtime call is followed by cudaCheckErrors so that a failed
// allocation or copy is reported where it happens instead of surfacing later
// as an unrelated kernel error. Host and device buffers are released before
// returning.
//
// Returns 0 on success and 1 if the host allocation fails; cudaCheckErrors
// exits the process with status 1 on any CUDA error.
int main(){
    // Allocation size is kept from the original (7 planes); tk itself only
    // indexes the first 4*DSIZE elements.
    const size_t bytes = 7*DSIZE*sizeof(double);
    double *h_d = NULL, *d_d = NULL;

    cudaMalloc(&d_d, bytes);
    cudaCheckErrors("cudaMalloc failed");

    h_d = (double *)malloc(bytes);
    if (h_d == NULL) {
        fprintf(stderr, "Fatal error: host malloc of %zu bytes failed\n", bytes);
        cudaFree(d_d);
        return 1;
    }

    // Sine/cosine of the two phase offsets, computed once on the host and
    // passed to the kernel by value.
    double smp23 = sin(M_PI_23);
    double cmp23 = cos(M_PI_23);
    double smp43 = sin(M_PI_43);
    double cmp43 = cos(M_PI_43);

    // Only the first DSIZE elements are inputs; the values lie in [0, 1].
    for (int i = 0; i < DSIZE; i++)
        h_d[i] = rand()/(double)RAND_MAX;
    cudaMemcpy(d_d, h_d, DSIZE*sizeof(double), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy host-to-device failed");

    // Timed region: NL launches plus the synchronize that waits for them.
    unsigned long long gtime = dtime_usec(0);
    for (int i = 0; i < NL; i++)
        tk<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_d, smp23, cmp23, smp43, cmp43, DSIZE);
    cudaDeviceSynchronize();
    gtime = dtime_usec(gtime);
    // Checked after the clock is stopped so the check is not part of the
    // measurement; covers both launch and execution errors.
    cudaCheckErrors("some error");

    printf("elapsed time: %fs\n", gtime/(float)(USECPSEC*NL));

    cudaFree(d_d);
    cudaCheckErrors("cudaFree failed");
    free(h_d);
    return 0;
}
$ nvcc -O3 t874.cu -o t874
$ ./t874
elapsed time: 0.000078s
$ nvcc -O3 -DUSE_I t874.cu -o t874
$ ./t874
elapsed time: 0.000060s
$
Fedora 20,CUDA 7.5RC,Quadro5000 GPU。