Question

我有以下代码片段，它是我的应用程序中的瓶颈：

double theta = acos(d);
double a = cos( theta*one_third );
double b = cos( theta*one_third + M_PI_23 );
double c = cos( theta*one_third + M_PI_43 );

其中 one_third = 1.0/3.0，M_PI_23 = M_PI*2.0/3.0，M_PI_43 = M_PI*4.0/3.0。这段代码位于 CUDA 代码中，不过在 x86 上也存在同样的问题。

有没有人知道上述代码有什么巧妙的简化方法，可以让我避免 acos 调用和/或后续的 cos 调用？这些调用合计占了 90% 的计算时间，其中单个 acos 调用的开销与三个 cos 调用相当。

THX

Answer 1

三角恒等式：cos(A + B) = cos A · cos B − sin A · sin B

所以改变这个：

double a = cos( theta*one_third );

改为这样：

double as, ac;
sincos( theta*one_third, &as, &ac );

然后您可以将a，b和c计算为：

double a = ac;
double b = ac*cos(M_PI_23) - as*sin(M_PI_23);
double c = ac*cos(M_PI_43) - as*sin(M_PI_43);

当然，如果可能的话，你应该用编译时常量替换 cos(M_PI_xx) 和 sin(M_PI_xx)。编译器也许能自行完成这一优化，但也可能不会。

下面是一个完整的可运行示例，它表明对于以这类计算为主的代码，可以获得约 30% 的加速（约快 1.3 倍）：

$ cat t874.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Factor that theta is multiplied by before the cosine is taken.
#define one_third (1.0/3.0)
// Disabled low-precision fallback for pi; M_PI is presumably supplied by <math.h>.
// #define M_PI 3.141592654
// Phase offsets 2*pi/3 and 4*pi/3 that are added to theta*one_third.
#define M_PI_23 (M_PI*2.0/3.0)
#define M_PI_43 (M_PI*4.0/3.0)
// Number of input values; main() passes this to the kernel as dsize.
#define DSIZE 65536
// Threads per block used for the kernel launch in main().
#define nTPB 256
// Number of back-to-back kernel launches that main() times.
#define NL 100

// Checks the CUDA runtime's last-error state and aborts the program on failure.
//   msg: caller-supplied C string printed in front of the CUDA error text.
// Uses cudaGetLastError(), so it reports (and clears) the error left behind by
// the preceding runtime call or kernel launch. On error it prints msg, the
// CUDA error string and the source location to stderr, then calls exit(1).
// The local is named cuda_check_err_ rather than __err because identifiers
// containing a double underscore are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Returns the current gettimeofday() time in microseconds minus `start`.
// Call with 0 to obtain a timestamp; pass that timestamp back in a later call
// to obtain the number of microseconds elapsed since it was taken.
long long dtime_usec(unsigned long long start){

  struct timeval now;
  gettimeofday(&now, 0);
  // Seconds scaled to microseconds, then the sub-second part added on top.
  unsigned long long stamp = now.tv_sec*USECPSEC;
  stamp += now.tv_usec;
  return stamp - start;
}

// Kernel: for each of the first dsize entries x of d, takes theta = acos(x) and
// stores cos(theta/3), cos(theta/3 + M_PI_23) and cos(theta/3 + M_PI_43) into
// the three dsize-long segments that follow the input inside d.
//   d      : buffer holding dsize inputs followed by room for 3*dsize outputs
//   smp23, cmp23, smp43, cmp43 : caller-provided sine/cosine values for the two
//            phase offsets; only read when USE_I is defined
//   dsize  : number of inputs
// Launch layout: 1D grid of 1D blocks, one thread per input; threads whose
// index is >= dsize return without touching memory.
// With USE_I defined, both shifted cosines are derived from one sincos() call
// via cos(A+B) = cosA*cosB - sinA*sinB; otherwise cos() is evaluated three times.
__global__ void tk(double *d, const double smp23, const double cmp23, const double smp43, const double cmp43, const int dsize){

  const int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= dsize) return;

  const double theta = acos(d[tid]);
  // Start of each output segment inside the shared buffer.
  double *out_a = d + dsize;
  double *out_b = d + 2*dsize;
  double *out_c = d + 3*dsize;
#ifdef USE_I
  double s3, c3;
  sincos(theta*one_third, &s3, &c3);
  out_a[tid] = c3;
  out_b[tid] = c3*cmp23 - s3*smp23;
  out_c[tid] = c3*cmp43 - s3*smp43;
#else
  out_a[tid] = cos( theta*one_third );
  out_b[tid] = cos( theta*one_third + M_PI_23 );
  out_c[tid] = cos( theta*one_third + M_PI_43 );
#endif
}

// Benchmark driver: fills DSIZE random inputs in [0, 1], uploads them to the
// device, launches tk NL times back to back and prints the mean elapsed time
// per launch in seconds.
// Returns 0 on success and 1 if the host allocation fails; any CUDA error
// terminates the program through cudaCheckErrors.
int main(){

  double *h_d, *d_d;
  // tk only touches the first 4*DSIZE doubles (input plus three output
  // segments); the original 7*DSIZE sizing is kept as-is.
  const size_t nbytes = 7*DSIZE*sizeof(double);
  cudaMalloc(&d_d, nbytes);
  cudaCheckErrors("cudaMalloc failed");
  h_d = (double *)malloc(nbytes);
  if (h_d == NULL){
    fprintf(stderr, "Fatal error: host malloc failed\n");
    cudaFree(d_d);
    return 1;
  }

  // Sine/cosine of the two phase offsets, computed once on the host and
  // passed to the kernel by value.
  double smp23 = sin(M_PI_23);
  double cmp23 = cos(M_PI_23);
  double smp43 = sin(M_PI_43);
  double cmp43 = cos(M_PI_43);
  for (int i = 0; i < DSIZE; i++)
    h_d[i] = rand()/(double)RAND_MAX;
  cudaMemcpy(d_d, h_d, DSIZE*sizeof(double), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy host-to-device failed");
  unsigned long long gtime = dtime_usec(0);
  for (int i = 0; i < NL; i++)
    tk<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_d, smp23, cmp23, smp43, cmp43, DSIZE);
  // Kernel launches are asynchronous: wait for all of them before the timer
  // is stopped.
  cudaDeviceSynchronize();
  gtime = dtime_usec(gtime);
  cudaCheckErrors("some error");
  printf("elapsed time: %fs\n", gtime/(float)(USECPSEC*NL));
  // Release both buffers before exiting.
  cudaFree(d_d);
  free(h_d);
  return 0;
}
$ nvcc -O3 t874.cu -o t874
$ ./t874
elapsed time: 0.000078s
$ nvcc -O3 -DUSE_I t874.cu -o t874
$ ./t874
elapsed time: 0.000060s
$

Fedora 20，CUDA 7.5RC，Quadro5000 GPU。

三角优化和简化

1 个答案: