Bisection Method in CUDA

Asked: 2013-05-17 20:38:23

Tags: cuda parallel-processing kernel gpu bisection

I am trying to implement the Bisection Method in CUDA. The method is able to approximate eigenvalues coming from an application of mine. I have some questions about how to do it. Here is my code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

double f(double x)
{
    //return ((5*sin(2*x))-(52*cos(2*x)))+50;
    return cos(x);
}

double absoluto(double n){
    if(n<0)  n=n*-1; 
    return(n);
}

// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n)
{
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    if (id < n)
        c[id] = (a[id] + b[id])/2;
}

int main( int argc, char* argv[] )
{
    int i=0;
    double malla = 1.0;
    double x1=0.0 , x2=10.0 , j=0.0;

    int n = (int)x2/(int)malla;

double *host_a;
double *host_b;
double *host_c;

double *dev_a;
double *dev_b;
double *dev_c;

size_t bytes = n*sizeof(double);

host_a = (double*)malloc(bytes);
host_b = (double*)malloc(bytes);
host_c = (double*)malloc(bytes);


cudaMalloc(&dev_a, bytes);
cudaMalloc(&dev_b, bytes);
cudaMalloc(&dev_c, bytes);



// Initialize vectors on host
for( j = 0.0; j < n; j=j+1.0 ) {
    if((f(x1)*f(x1+malla))>0){
            x1 = x1 + malla;
            i++;
    }
    else{
            host_a[i] = x1;
            host_b[i] = x1+malla;
            x1 = x1 + malla;
            i++;
    }
}

int blockSize, gridSize;
blockSize = 1024;
gridSize = (int)ceil((float)n/blockSize);

i=0;

// Copy host vectors to device
cudaMemcpy( dev_a, host_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, host_b, bytes, cudaMemcpyHostToDevice);

// Execute the kernel
biseccion<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, n);
// Copy array back to host
cudaMemcpy( host_c, dev_c, bytes, cudaMemcpyDeviceToHost );

i=0;
for(j=0.0;j<n;j++){
    printf("%f\n",host_c[i])
    i++;
}

// Release device memory
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);

// Release host memory
free(host_a);
free(host_b);
free(host_c);

return 0;
}

The code above only returns the first approximation, so I have to add some comparisons. For example:

If the product of f(c) and f(a) is negative, the new values of a and b are a and c; if not, the new values are c and b, for the next iteration of the kernel. And of course this has to happen in a loop.
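In plain host C, one step of the update I mean would look something like this (just a sketch):

// One step of the interval update described above (a sketch):
double c = (a + b) / 2.0;
if (f(c) * f(a) < 0.0)
    b = c;   // sign change in [a, c]: keep the left half
else
    a = c;   // otherwise the root lies in [c, b]: keep the right half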

My first question is: how do I perform the loop inside the kernel to keep refining the approximation?

Second, how do I implement the if comparison of values inside the kernel?

Finally, will control structures such as if degrade performance?

2 Answers:

Answer 0 (score: 3)

You need to define a stopping criterion or convergence criterion: when do you stop refining the approximation? Let's suppose your stopping criterion is simply some number of iterations of the bisection loop. We can pass it to the kernel as a parameter.

We can then rewrite the kernel like this:

// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n, int loopcnt)
{
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    int loops = 0;
    if (id < n)
      while (loops < loopcnt){
        c[id] = (a[id] + b[id])/2;
        if ((f(c[id]) * f(a[id])) < 0) b[id] = c[id];
        else a[id] = c[id];
        loops++;
        }

}

I think if you study the changes I made to the kernel, you'll see that it is written almost exactly the way you would write it in ordinary C/C++ code.

For the above kernel to work, we have to tell the compiler that we want the f(x) function to be usable on both the host and the device, which we do with the __host__ __device__ decorators:

__host__ __device__ double f(double x)
{
    //return ((5*sin(2*x))-(52*cos(2*x)))+50;
    return cos(x);
}

Note that the changes above are not particularly optimized. For example, the variables we store in global memory, such as a[id], b[id], and c[id], get reused quite a bit. We could use shared memory (or perhaps even just local thread variables; there aren't many of them) and write the results back to global memory only when the loop is finished.
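As a rough sketch of that idea (not part of the modified code below), each thread could keep its bracket in local variables and touch global memory only on the way in and on the way out:

// A sketch: each thread keeps its bracket in registers and writes the
// result back to global memory once, after the loop finishes.
__global__ void biseccion_local(double *a, double *b, double *c, int n, int loopcnt)
{
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    if (id < n){
        double lo  = a[id];           // one global read per endpoint
        double hi  = b[id];
        double flo = f(lo);           // cache f at the moving left endpoint
        double mid = lo;
        for (int loops = 0; loops < loopcnt; loops++){
            mid = (lo + hi)/2;
            double fmid = f(mid);
            if (fmid * flo < 0) hi = mid;     // root lies in [lo, mid]
            else { lo = mid; flo = fmid; }    // root lies in [mid, hi]
        }
        c[id] = mid;                  // single global write
    }
}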

I had to make a few other changes to your code to use it in a way that made sense to me. Here is a complete modified version of the code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

__host__ __device__ double f(double x)
{
    //return ((5*sin(2*x))-(52*cos(2*x)))+50;
    return cos(x);
}

double absoluto(double n){
    if(n<0)  n=n*-1;
    return(n);
}

// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n, int loopcnt)
{
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    int loops = 0;
    if (id < n)
      while (loops < loopcnt){
        c[id] = (a[id] + b[id])/2;
        if ((f(c[id]) * f(a[id])) < 0) b[id] = c[id];
        else a[id] = c[id];
        loops++;
        }

}

int main( int argc, char* argv[] )
{
    int i=0;
    int loops=1000;  // this is the number of bisection iterations to run
    double malla = 1.0;
    double x1=0.0 , x2=10.0 , j=0.0;

    int n = (int)x2/(int)malla;

double *host_a;
double *host_b;
double *host_c;

double *dev_a;
double *dev_b;
double *dev_c;

size_t bytes = n*sizeof(double);

host_a = (double*)malloc(bytes);
host_b = (double*)malloc(bytes);
host_c = (double*)malloc(bytes);


cudaMalloc(&dev_a, bytes);
cudaMalloc(&dev_b, bytes);
cudaMalloc(&dev_c, bytes);


// Initialize vectors on host
while( i < n) {
    if((f(x1)*f(x1+malla))>0){
            x1 = x1 + malla;
    }
    else{
            host_a[i] = x1;
            host_b[i] = x1+malla;
            x1 = x1 + malla;
            i++;
    }
}

int blockSize, gridSize;
blockSize = 256;
gridSize = (int)ceil((float)n/blockSize);

i=0;

// Copy host vectors to device
cudaMemcpy( dev_a, host_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, host_b, bytes, cudaMemcpyHostToDevice);

// Execute the kernel
biseccion<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, n, loops);
// Copy array back to host
cudaMemcpy( host_c, dev_c, bytes, cudaMemcpyDeviceToHost );

i=0;
for(j=0.0;j<n;j++){
    printf("%f\n",host_c[i]);
    i++;
}

// Release device memory
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);

// Release host memory
free(host_a);
free(host_b);
free(host_c);

return 0;
}

A few other notes:

  • I changed the block size from 1024 to 256. I did this to make sure there are enough registers per thread for the kernel to launch. For additional commentary on this issue, take a look here. It doesn't really affect anything here.
  • You should do cuda error checking on all cuda API calls and all kernel calls (see the sketch after this list).
  • I modified your logic for setting up the initial bisection starting points somewhat. Your approach didn't make sense to me (it skipped some bisection ranges).
  • I added a loops variable to define how many bisection iterations to run.
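Here is a minimal sketch of that error checking (a gpuErrchk macro like the one that appears in the second answer below; checking a kernel launch with cudaPeekAtLastError and cudaDeviceSynchronize is the usual pattern):

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// Wrap every API call, then check the kernel launch itself:
gpuErrchk(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice));
biseccion<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, n, loops);
gpuErrchk(cudaPeekAtLastError());      // catches launch-configuration errors
gpuErrchk(cudaDeviceSynchronize());    // catches errors during kernel execution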

When I run the code, I get results like this:

1.570796
4.712389
7.853982
10.995574
14.137167
17.278760
20.420352
23.561945
26.703538
29.845130

You'll notice that the first result is pi/2, and that each subsequent result adds pi to it (the roots of cos(x) are x = pi/2 + k*pi), so I believe these are the correct results for the first 10 roots of cos(x).

Answer 1 (score: 1)

Robert Crovella has already pointed out that your problem is the stopping rule, which he expressed in terms of the number of iterations.

In a bisection method only slightly more complex than the simplest one, the stopping rule can also be related to a target accuracy. Below I provide a version of bisection in CUDA, adapted from the one presented in the Numerical Recipes in C++ book, by which you can also set a target accuracy.

A computationally more sophisticated bisection method could perhaps be obtained by adapting the bisection kernel exploited in the eigenvalues CUDA SDK sample.

The new version of the method appears to be more accurate. Here are some results:

No target accuracy

1.571289062500
4.453613281250
6.504882812500
10.546875000000
13.171386718750

Target accuracy

1.570796326795
4.712388980385
7.853981633975
10.995574287564
14.137166941154

Actual roots

1.570796326794897
4.712388980384690
7.853981633974483
10.995574287564276
14.137166941154069

Accuracy could be further improved by the better initial bracketing provided in the above-mentioned book.
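For instance (a sketch only, loosely modeled on the interval-subdivision bracketing idea from that book; the function name and parameters here are illustrative, not from the original answer), the search interval can be subdivided and every sign change recorded as a bracket:

// A host-side sketch: subdivide [x1, x2] into n segments and record every
// segment whose endpoints show a sign change of f.
int bracket_roots(double x1, double x2, int n, double *xb1, double *xb2, int nbmax)
{
    int nb = 0;
    double dx = (x2 - x1) / n;
    double xl = x1, fl = f(xl);
    for (int i = 0; i < n && nb < nbmax; i++) {
        double xr = xl + dx;
        double fr = f(xr);
        if (fl * fr <= 0.0) { xb1[nb] = xl; xb2[nb] = xr; nb++; }
        xl = xr; fl = fr;
    }
    return nb;   // number of bracketing intervals found
}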

Here is the full code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <math_constants.h>

#define BLOCKSIZE 512

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/************************************/
/* FUNCTION TO SEARCH THE ROOTS FOR */
/************************************/
__host__ __device__ double f(double x)
{
    //return ((5*sin(2*x))-(52*cos(2*x)))+50;
    return cos(x);
}

/***************************************/
/* BISECTION KERNEL - ORIGINAL VERSION */
/***************************************/
__global__ void bisection(double *a, double *b, double *c, int N, int loopcnt)
{
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    int loops = 0;
    if (tid < N)
      while (loops < loopcnt){
        c[tid] = (a[tid] + b[tid])/2;
        if ((f(c[tid]) * f(a[tid])) < 0) b[tid] = c[tid];
        else a[tid] = c[tid];
        loops++;
        }
}

/************************************************/
/* BISECTION KERNEL - NUMERICAL RECIPES VERSION */
/************************************************/
// --- Using bisection, return the root of a function func known to lie between x1 and x2.
//     The root will be refined until its accuracy is xacc.

__global__ void bisection_NR(const double *d_x1, const double *d_x2, double *d_roots, const double xacc, const int loopcnt, const int N) {

    // --- loopcnt is the maximum allowed number of bisections.

    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    if (tid < N) {
        double dx,xmid,rtb;

        double f1=f(d_x1[tid]);
        double fmid=f(d_x2[tid]);

        if (f1*fmid >= 0.0) { d_roots[tid] = CUDART_NAN; return; } // --- no root bracketed
        rtb = f1 < 0.0 ? (dx=d_x2[tid]-d_x1[tid],d_x1[tid]) : (dx=d_x1[tid]-d_x2[tid],d_x2[tid]); // --- Orient the search so that f>0
        for (int j=0;j<loopcnt;j++) { // --- lies at x+dx.
            fmid=f(xmid=rtb+(dx *= 0.5)); // --- Bisection loop.
            if (fmid <= 0.0) rtb=xmid;
            if (abs(dx) < xacc || fmid == 0.0) { d_roots[tid]=rtb; return; }
        }
        d_roots[tid] = CUDART_NAN;
    }
}

/*******/
/* INT */
/*******/
int main()
{
    int loops=100000;                   // --- Number of bisection iterations to run
    double x1=0.0, x2=10.0;             // --- Minimum and maximum values of the search interval
    double Deltax = 1.0;                // --- Sampling step of the search interval

    int N = (int)x2/(int)Deltax;        // --- Number of search intervals

    // --- Host-side memory allocations
    double *host_a = (double*)malloc(N*sizeof(double));
    double *host_b = (double*)malloc(N*sizeof(double));
    double *host_c = (double*)malloc(N*sizeof(double));

    // --- Device-side memory allocations
    double *dev_a; gpuErrchk(cudaMalloc(&dev_a, N*sizeof(double)));
    double *dev_b; gpuErrchk(cudaMalloc(&dev_b, N*sizeof(double)));
    double *dev_c; gpuErrchk(cudaMalloc(&dev_c, N*sizeof(double)));

    // --- Initialize vectors on host
    int i=0;
    while(i < N) {
        if((f(x1)*f(x1+Deltax))>0) x1 = x1 + Deltax;
        else {
            host_a[i] = x1;
            host_b[i] = x1+Deltax;
            x1 = x1 + Deltax;
            i++;
        }
    }

    // --- Copy host vectors to device
    gpuErrchk(cudaMemcpy(dev_a, host_a, N*sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_b, host_b, N*sizeof(double), cudaMemcpyHostToDevice));

    bisection<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(dev_a, dev_b, dev_c, loops, N);
    gpuErrchk(cudaMemcpy(host_c, dev_c, N*sizeof(double), cudaMemcpyDeviceToHost));
    for(i=0; i<N; i++) printf("%3.12f\n",host_c[i]);
    printf("\n");

    bisection_NR<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(dev_a, dev_b, dev_c, 2.5e-13, loops, N);
    gpuErrchk(cudaMemcpy(host_c, dev_c, N*sizeof(double), cudaMemcpyDeviceToHost));
    for(i=0; i<N; i++) printf("%3.12f\n",host_c[i]);

    // --- Release device memory
    gpuErrchk(cudaFree(dev_a));
    gpuErrchk(cudaFree(dev_b));
    gpuErrchk(cudaFree(dev_c));

    // --- Release host memory
    free(host_a);
    free(host_b);
    free(host_c);

    return 0;
}