是否值得在设备(GPU)中执行数学函数pow()
,以便改善代码的执行时间?
我从 Cuda Toolkit文档中找到了函数__powf()
:
http://docs.nvidia.com/cuda/cuda-c-programming-guide/#intrinsic-functions
所以我用pow()
替换了__powf()
函数调用,并且我为编译器使用了选项-use_fast_math
,但我得到的结果是“nan”而不是双精度数。我应该如何改变我的代码来实现上述目标?
我的代码库.cu:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/time.h> // for gettimeofday()
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
#include "magma.h"
我的部分代码.cu:
void function(double *cx, double *cy, double *R, int var, double pts[][2], int e) {
magma_trans_t my_trans = MagmaNoTrans;
magma_int_t info;
magma_int_t M, C;
magma_int_t ldda, lddb;
C = 3;
M = var;
int i;
double Q[M];
double a[3];
int ret;
double A[3][M];
double pts_x[M], pts_y[M];
double *dev_pts_x, *dev_pts_y, *devA, *devB, *pWork, lWorkQuery[1];
/* Allocate device memory for the matrix (column-major) */
ldda = ((M + 31) / 32) * 32;
lddb = ldda;
cudaMalloc((void **)&devA, (ldda * C) * sizeof(double));
cudaMalloc((void **)&devB, (M) * sizeof(double));
for (i = 0; i < M; i++) {
pts_x[i] = pts[i][0];
pts_y[i] = pts[i][1];
A[0][i] = pts[i][0];
A[1][i] = pts[i][1];
A[2][i] = 1.0;
}
cudaMalloc((void **)&dev_pts_x, (M) * sizeof(double));
cudaMemcpy(dev_pts_x, pts_x, M * sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void **)&dev_pts_y, (M) * sizeof(double));
cudaMemcpy(dev_pts_y, pts_y, M * sizeof(double), cudaMemcpyHostToDevice);
// Kernel invocation
dim3 threadsPerBlock(1, 1);
dim3 numBlocks(M / threadsPerBlock.x, M / threadsPerBlock.y);
call <<< numBlocks, threadsPerBlock >>> (var, dev_pts_x, dev_pts_y, devB);
cublasSetMatrix(M, C, sizeof(double), A, M, devA, ldda);
// cublasSetMatrix(M, 1, sizeof(double), B, M, devB, M);
/* Resolve the LLSP using MAGMA */
ret = magma_dgels_gpu(my_trans, M, C, 1 , devA, ldda, devB, M, lWorkQuery, -1, &info);
int lwork = (int)lWorkQuery[0];
//printf("Optimal work space %d\n", lwork);
pWork = (double*)malloc((lwork) * sizeof(double));
ret = magma_dgels_gpu(my_trans, M, C, 1, devA, ldda, devB, M, pWork, lwork, &info);
magma_dgetmatrix(M, 1, devB, lddb, Q, M);
a[2] = Q[2];
*cx = Q[0];
*cy = Q[1];
*R = sqrt((pow(*cx, 2)+pow(*cy, 2)) - a[2]);
}
__global__ void call(int v, double *pts_x, double *pts_y, double *B) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < v) {
B[i] = -(pow(pts_x[i], 2.0) + pow(pts_y[i], 2.0));
}
}
答案 0 :(得分:2)
您使用pow
来对数字进行平方,这是非常低效的。使用带内联函数的乘法:
static inline double square(double x) { return x * x; }
您可能会收到NaN
个值,因为传递给pow
的数字为负数。这应该不是问题,但pow
或__powf
的cuda实施可能不支持。
另请注意,使用hypot()
函数可以更直接地计算两点之间的欧几里德距离:
double hypot(double x, double y);
最后,正如Weather Vane强调的那样,如果您感兴趣的是与另一个距离进行比较,您可能不需要取平方根。