I am trying to implement the bisection method in CUDA. The method approximates the eigenvalues needed by my application. I have a few questions about how to do this. Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

double f(double x)
{
    //return ((5*sin(2*x))-(52*cos(2*x)))+50;
    return cos(x);
}

double absoluto(double n){
    if(n<0) n=n*-1;
    return(n);
}

// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n)
{
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    if (id < n)
        c[id] = (a[id] + b[id])/2;
}

int main( int argc, char* argv[] )
{
    int i=0;
    double malla = 1.0;
    double x1=0.0 , x2=10.0 , j=0.0;
    int n = (int)x2/(int)malla;

    double *host_a;
    double *host_b;
    double *host_c;
    double *dev_a;
    double *dev_b;
    double *dev_c;

    size_t bytes = n*sizeof(double);
    host_a = (double*)malloc(bytes);
    host_b = (double*)malloc(bytes);
    host_c = (double*)malloc(bytes);
    cudaMalloc(&dev_a, bytes);
    cudaMalloc(&dev_b, bytes);
    cudaMalloc(&dev_c, bytes);

    // Initialize vectors on host
    for( j = 0.0; j < n; j=j+1.0 ) {
        if((f(x1)*f(x1+malla))>0){
            x1 = x1 + malla;
            i++;
        }
        else{
            host_a[i] = x1;
            host_b[i] = x1+malla;
            x1 = x1 + malla;
            i++;
        }
    }

    int blockSize, gridSize;
    blockSize = 1024;
    gridSize = (int)ceil((float)n/blockSize);
    i=0;

    // Copy host vectors to device
    cudaMemcpy( dev_a, host_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_b, host_b, bytes, cudaMemcpyHostToDevice);

    // Execute the kernel
    biseccion<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, n);

    // Copy array back to host
    cudaMemcpy( host_c, dev_c, bytes, cudaMemcpyDeviceToHost );

    i=0;
    for(j=0.0;j<n;j++){
        printf("%f\n",host_c[i]);
        i++;
    }

    // Release device memory
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    // Release host memory
    free(host_a);
    free(host_b);
    free(host_c);

    return 0;
}
The code above only returns the first approximation, so I have to add some comparisons. For example: if the product of f(a) and f(c) is negative, then the new values of a and b for the next kernel iteration will be a and c; otherwise, the new values will be c and b. Of course, this has to happen in a loop.
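Formally, with c the midpoint of the current bracket, the update I am describing keeps the sign change (and hence the root) inside the bracket:

$$c = \frac{a+b}{2}, \qquad [a,b] \leftarrow \begin{cases} [a,c] & \text{if } f(a)\,f(c) < 0, \\ [c,b] & \text{otherwise.} \end{cases}$$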
My first question is: how do I perform a loop inside the kernel to keep refining the approximation?
Second, how do I implement the if that compares the values in the kernel?
Finally, do control structures such as if degrade performance?
Answer 0 (score: 3)
You need to define a stopping criterion, or convergence criterion: when should the approximation stop? Let's suppose your stopping criterion is simply some number of bisection iterations; we can pass that to the kernel as a parameter.
Then we can rewrite the kernel like this:
// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n, int loopcnt)
{
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    int loops = 0;
    if (id < n)
        while (loops < loopcnt){
            c[id] = (a[id] + b[id])/2;
            if ((f(c[id]) * f(a[id])) < 0) b[id] = c[id];
            else a[id] = c[id];
            loops++;
        }
}
I think that if you study the changes I made to the kernel, you'll see it is almost just like writing it in ordinary C/C++ code.
To make the above kernel work, we have to tell the compiler that we want the f(x) function to be usable on both the host and the device, which we do with the __host__ __device__ decorators:
__host__ __device__ double f(double x)
{
    //return ((5*sin(2*x))-(52*cos(2*x)))+50;
    return cos(x);
}
Note that the above changes are not particularly optimized. For example, the variables we store in global memory get quite a lot of reuse, e.g. a[id], b[id], and c[id]. We could use shared memory (or even just local thread variables; there aren't many of them) and write the results back to global memory only when we are done looping.
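As a rough illustration of that idea (a sketch only; the kernel name biseccion_reg is mine, not part of the original code), the same kernel can keep the bracket in per-thread local variables and touch global memory just once per array:

// Sketch: bisection with the bracket held in registers;
// one global read per endpoint and one global write at the end.
__global__ void biseccion_reg(double *a, double *b, double *c, int n, int loopcnt)
{
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    if (id < n){
        double la = a[id];        // local copy of the left endpoint
        double lb = b[id];        // local copy of the right endpoint
        double lc = (la + lb)/2;
        for (int loops = 0; loops < loopcnt; loops++){
            lc = (la + lb)/2;
            if ((f(lc) * f(la)) < 0) lb = lc;
            else                     la = lc;
        }
        c[id] = lc;               // single write back to global memory
    }
}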
I had to make a few other changes to your code to use it in a way that made sense to me. Here is a complete modified version of the code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

__host__ __device__ double f(double x)
{
    //return ((5*sin(2*x))-(52*cos(2*x)))+50;
    return cos(x);
}

double absoluto(double n){
    if(n<0) n=n*-1;
    return(n);
}

// Kernel CUDA
__global__ void biseccion(double *a, double *b, double *c, int n, int loopcnt)
{
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    int loops = 0;
    if (id < n)
        while (loops < loopcnt){
            c[id] = (a[id] + b[id])/2;
            if ((f(c[id]) * f(a[id])) < 0) b[id] = c[id];
            else a[id] = c[id];
            loops++;
        }
}

int main( int argc, char* argv[] )
{
    int i=0;
    int loops=1000; // this is the number of bisection iterations to run
    double malla = 1.0;
    double x1=0.0 , x2=10.0 , j=0.0;
    int n = (int)x2/(int)malla;

    double *host_a;
    double *host_b;
    double *host_c;
    double *dev_a;
    double *dev_b;
    double *dev_c;

    size_t bytes = n*sizeof(double);
    host_a = (double*)malloc(bytes);
    host_b = (double*)malloc(bytes);
    host_c = (double*)malloc(bytes);
    cudaMalloc(&dev_a, bytes);
    cudaMalloc(&dev_b, bytes);
    cudaMalloc(&dev_c, bytes);

    // Initialize vectors on host
    while( i < n) {
        if((f(x1)*f(x1+malla))>0){
            x1 = x1 + malla;
        }
        else{
            host_a[i] = x1;
            host_b[i] = x1+malla;
            x1 = x1 + malla;
            i++;
        }
    }

    int blockSize, gridSize;
    blockSize = 256;
    gridSize = (int)ceil((float)n/blockSize);
    i=0;

    // Copy host vectors to device
    cudaMemcpy( dev_a, host_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_b, host_b, bytes, cudaMemcpyHostToDevice);

    // Execute the kernel
    biseccion<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, n, loops);

    // Copy array back to host
    cudaMemcpy( host_c, dev_c, bytes, cudaMemcpyDeviceToHost );

    i=0;
    for(j=0.0;j<n;j++){
        printf("%f\n",host_c[i]);
        i++;
    }

    // Release device memory
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    // Release host memory
    free(host_a);
    free(host_b);
    free(host_c);

    return 0;
}
A few other notes: I added a loops variable to define how many bisection iterations to run. When I run the code, I get results like this:
1.570796
4.712389
7.853982
10.995574
14.137167
17.278760
20.420352
23.561945
26.703538
29.845130
You'll notice that the first result is pi/2 and each subsequent result adds pi, so I believe these are the correct results for the first 10 roots of cos(x).
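Indeed, the zeros of the cosine are

$$\cos x = 0 \iff x = \frac{\pi}{2} + k\pi, \quad k = 0, 1, 2, \dots$$

so the tenth root is 19π/2 ≈ 29.845130, which matches the last value printed above.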
Answer 1 (score: 1)
Robert Crovella has already pointed out that your problem was the stopping rule, which was given in terms of a number of iterations.
For a somewhat more sophisticated bisection, the stopping rule can also be tied to a target accuracy. Below I provide a version of bisection in CUDA, adapted from the one available in the Numerical Recipes in C++ book, in which you can also set a target accuracy.
Perhaps a computationally more sophisticated bisection could be obtained by adapting the bisection kernel exploited in the eigenvalues CUDA SDK sample.
The new version of the method also appears to be more accurate. Below are some results:
No target accuracy
1.571289062500
4.453613281250
6.504882812500
10.546875000000
13.171386718750
Target accuracy
1.570796326795
4.712388980385
7.853981633975
10.995574287564
14.137166941154
Actual roots
1.570796326794897
4.712388980384690
7.853981633974483
10.995574287564276
14.137166941154069
Better accuracy could be achieved by a better initial bracketing, again as provided in the above-mentioned book.
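For reference, here is a minimal host-side sketch of such a bracketing step, modeled on the book's zbrak routine (the function name bracket_roots and its parameters are illustrative, not taken from the answer's code): it subdivides [x1, x2] into nsub subintervals and records every sign change, so a larger nsub yields tighter initial brackets.

// Sketch: zbrak-style bracketing; returns the number of brackets found,
// storing their endpoints in xb1[] and xb2[] (at most nbmax of them).
int bracket_roots(double x1, double x2, int nsub, double *xb1, double *xb2, int nbmax)
{
    int nb = 0;                    // number of brackets found so far
    double dx = (x2 - x1)/nsub;    // subinterval width
    double x  = x1;
    double fp = f(x);
    for (int i = 0; i < nsub && nb < nbmax; i++) {
        double fc = f(x += dx);    // step right and evaluate
        if (fp*fc <= 0.0) {        // sign change: a root is bracketed here
            xb1[nb] = x - dx;
            xb2[nb] = x;
            nb++;
        }
        fp = fc;
    }
    return nb;
}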
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <math_constants.h>

#define BLOCKSIZE 512

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/************************************/
/* FUNCTION TO SEARCH THE ROOTS FOR */
/************************************/
__host__ __device__ double f(double x)
{
    //return ((5*sin(2*x))-(52*cos(2*x)))+50;
    return cos(x);
}

/***************************************/
/* BISECTION KERNEL - ORIGINAL VERSION */
/***************************************/
__global__ void bisection(double *a, double *b, double *c, int N, int loopcnt)
{
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    int loops = 0;
    if (tid < N)
        while (loops < loopcnt){
            c[tid] = (a[tid] + b[tid])/2;
            if ((f(c[tid]) * f(a[tid])) < 0) b[tid] = c[tid];
            else a[tid] = c[tid];
            loops++;
        }
}

/************************************************/
/* BISECTION KERNEL - NUMERICAL RECIPES VERSION */
/************************************************/
// --- Using bisection, return the root of a function func known to lie between x1 and x2.
//     The root will be refined until its accuracy is xacc.
__global__ void bisection_NR(const double *d_x1, const double *d_x2, double *d_roots, const double xacc, const int loopcnt, const int N) {
    // --- loopcnt is the maximum allowed number of bisections.
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    if (tid < N) {
        double dx,xmid,rtb;
        double f1=f(d_x1[tid]);
        double fmid=f(d_x2[tid]);
        if (f1*fmid >= 0.0) { d_roots[tid] = CUDART_NAN; return; } // --- Invalid bracket: no sign change.
        rtb = f1 < 0.0 ? (dx=d_x2[tid]-d_x1[tid],d_x1[tid]) : (dx=d_x1[tid]-d_x2[tid],d_x2[tid]); // --- Orient the search so that f>0
        for (int j=0;j<loopcnt;j++) {                              // --- lies at x+dx.
            fmid=f(xmid=rtb+(dx *= 0.5));                          // --- Bisection loop.
            if (fmid <= 0.0) rtb=xmid;
            if (fabs(dx) < xacc || fmid == 0.0) { d_roots[tid]=rtb; return; }
        }
        d_roots[tid] = CUDART_NAN;                                 // --- Too many bisections.
    }
}

/********/
/* MAIN */
/********/
int main()
{
    int loops=100000;            // --- Number of bisection iterations to run
    double x1=0.0, x2=10.0;      // --- Minimum and maximum values of the search interval
    double Deltax = 1.0;         // --- Sampling step of the search interval
    int N = (int)x2/(int)Deltax; // --- Number of search intervals

    // --- Host-side memory allocations
    double *host_a = (double*)malloc(N*sizeof(double));
    double *host_b = (double*)malloc(N*sizeof(double));
    double *host_c = (double*)malloc(N*sizeof(double));

    // --- Device-side memory allocations
    double *dev_a; gpuErrchk(cudaMalloc(&dev_a, N*sizeof(double)));
    double *dev_b; gpuErrchk(cudaMalloc(&dev_b, N*sizeof(double)));
    double *dev_c; gpuErrchk(cudaMalloc(&dev_c, N*sizeof(double)));

    // --- Initialize vectors on host
    int i=0;
    while(i < N) {
        if((f(x1)*f(x1+Deltax))>0) x1 = x1 + Deltax;
        else {
            host_a[i] = x1;
            host_b[i] = x1+Deltax;
            x1 = x1 + Deltax;
            i++;
        }
    }

    // --- Copy host vectors to device
    gpuErrchk(cudaMemcpy(dev_a, host_a, N*sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_b, host_b, N*sizeof(double), cudaMemcpyHostToDevice));

    bisection<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(dev_a, dev_b, dev_c, N, loops);
    gpuErrchk(cudaMemcpy(host_c, dev_c, N*sizeof(double), cudaMemcpyDeviceToHost));

    for(i=0; i<N; i++) printf("%3.12f\n",host_c[i]);
    printf("\n");

    bisection_NR<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(dev_a, dev_b, dev_c, 2.5e-13, loops, N);
    gpuErrchk(cudaMemcpy(host_c, dev_c, N*sizeof(double), cudaMemcpyDeviceToHost));

    for(i=0; i<N; i++) printf("%3.12f\n",host_c[i]);

    // --- Release device memory
    gpuErrchk(cudaFree(dev_a));
    gpuErrchk(cudaFree(dev_b));
    gpuErrchk(cudaFree(dev_c));

    // --- Release host memory
    free(host_a);
    free(host_b);
    free(host_c);

    return 0;
}