设备中用于改善结果的数学函数(CUDA)

时间:2016-09-19 11:35:26

标签: cuda

是否值得在设备(GPU)中执行数学函数pow(),以便改善代码的执行时间?

我从 Cuda Toolkit文档中找到了函数__powf():  http://docs.nvidia.com/cuda/cuda-c-programming-guide/#intrinsic-functions

所以我用pow()替换了__powf()函数调用,并且我为编译器使用了选项-use_fast_math,但我得到的结果是“nan”而不是双精度数。我应该如何改变我的代码来实现上述目标?

我的代码库.cu:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/time.h>                // for gettimeofday()
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
#include "magma.h"

我的部分代码.cu:

void function(double *cx, double *cy, double *R, int var, double pts[][2], int e) {

    magma_trans_t my_trans = MagmaNoTrans;
    magma_int_t info; 
    magma_int_t M, C;
    magma_int_t ldda, lddb;

    C = 3;
    M = var;
    int i;
    double Q[M];
    double a[3];
    int ret;
    double A[3][M];
    double pts_x[M], pts_y[M];
    double *dev_pts_x, *dev_pts_y, *devA, *devB, *pWork, lWorkQuery[1]; 

    /* Allocate device memory for the matrix (column-major) */
    ldda = ((M + 31) / 32) * 32;
    lddb = ldda;

    cudaMalloc((void **)&devA, (ldda * C) * sizeof(double));
    cudaMalloc((void **)&devB, (M) * sizeof(double));

    for (i = 0; i < M; i++) {
        pts_x[i] = pts[i][0];
        pts_y[i] = pts[i][1];
        A[0][i] = pts[i][0];
        A[1][i] = pts[i][1];
        A[2][i] = 1.0;
    }   

    cudaMalloc((void **)&dev_pts_x, (M) * sizeof(double));
    cudaMemcpy(dev_pts_x, pts_x, M * sizeof(double), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&dev_pts_y, (M) * sizeof(double));
    cudaMemcpy(dev_pts_y, pts_y, M * sizeof(double), cudaMemcpyHostToDevice);

    // Kernel invocation
    dim3 threadsPerBlock(1, 1);
    dim3 numBlocks(M / threadsPerBlock.x, M / threadsPerBlock.y);
    call <<< numBlocks, threadsPerBlock >>> (var, dev_pts_x, dev_pts_y, devB);

    cublasSetMatrix(M, C, sizeof(double), A, M, devA, ldda);
    // cublasSetMatrix(M, 1, sizeof(double), B, M, devB, M);

    /* Resolve the LLSP using MAGMA */
    ret = magma_dgels_gpu(my_trans, M, C, 1 , devA, ldda, devB, M, lWorkQuery, -1, &info);

    int lwork = (int)lWorkQuery[0];
    //printf("Optimal work space %d\n", lwork);
    pWork = (double*)malloc((lwork) * sizeof(double)); 

    ret = magma_dgels_gpu(my_trans, M, C, 1, devA, ldda, devB, M, pWork, lwork, &info);

    magma_dgetmatrix(M, 1, devB, lddb, Q, M);

    a[2] = Q[2];
    *cx = Q[0];
    *cy = Q[1];

    *R = sqrt((pow(*cx, 2)+pow(*cy, 2)) - a[2]);
}

__global__ void call(int v, double *pts_x, double *pts_y, double *B) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < v) {
        B[i] = -(pow(pts_x[i], 2.0) + pow(pts_y[i], 2.0));
    }
}

1 个答案:

答案 0 :(得分:2)

您使用pow来对数字进行平方,这是非常低效的。使用带内联函数的乘法:

static inline double square(double x) { return x * x; }

您可能会收到NaN个值,因为传递给pow的数字为负数。这应该不是问题,但pow__powf的cuda实施可能不支持。

另请注意,使用hypot()函数可以更直接地计算两点之间的欧几里德距离:

double hypot(double x, double y);

最后,正如Weather Vane强调的那样,如果您感兴趣的是与另一个距离进行比较,您可能不需要取平方根。