Is there a function in cuBLAS that applies the sigmoid function to a vector?

Asked: 2015-03-03 00:35:20

Tags: cuda gpu cublas

As the title says, I want to apply a function element-wise to a vector. Is there any function in the cuBLAS library that can do this?

1 Answer:

Answer 0 (score: 3)

I am not aware of a suitable CUBLAS function that could help with this task. However, you can easily write your own code that applies the sigmoid function, or any other single-argument function, to a vector. Note that in most cases such code will be memory-bound rather than compute-bound. For a worked example, see the CUDA program below, in particular sigmoid_kernel(). The output of the program should look like this:

source[0]= 0.0000000000000000e+000  source[99999]= 9.9999000000000005e-001
result[0]= 5.0000000000000000e-001  result[99999]= 7.3105661250612963e-001

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define DEFAULT_LEN   100000

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

__device__ __forceinline__ double sigmoid (double a)
{
    return 1.0 / (1.0 + exp (-a));
}

__global__ void sigmoid_kernel (const double * __restrict__ src, 
                                double * __restrict__ dst, int len)
{
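    /* grid-stride loop: each thread processes multiple elements, so a
       fixed-size grid can cover a vector of arbitrary length */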
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        dst[i] = sigmoid (src[i]);
    }
}    

int main (void)
{
    double *source, *result;
    double *d_a = 0, *d_b = 0;

    int len = DEFAULT_LEN;

    /* Allocate memory on host */
    source = (double *)malloc (len * sizeof (source[0]));
    if (!source) return EXIT_FAILURE;
    result = (double *)malloc (len * sizeof (result[0]));
    if (!result) return EXIT_FAILURE;

    /* create source data */
    for (int i = 0; i < len; i++) source [i] = i * 1e-5;

    /* spot check of source data */
    printf ("source[0]=% 23.16e  source[%d]=% 23.16e\n", 
            source[0], len-1, source[len-1]);

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * len));

    /* Push source data to device */
    CUDA_SAFE_CALL (cudaMemcpy (d_a, source, sizeof(d_a[0]) * len,
                                cudaMemcpyHostToDevice));

    /* Compute execution configuration */
    dim3 dimBlock(256);
    int threadBlocks = (len + (dimBlock.x - 1)) / dimBlock.x;
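    /* cap the grid size; the grid-stride loop in the kernel covers any
       leftover elements, and 65520 stays below the 65535 limit on the
       grid x-dimension of older (pre-cc 3.0) GPUs */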
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);

    sigmoid_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len);
    CHECK_LAUNCH_ERROR();

    /* retrieve results from device */
    CUDA_SAFE_CALL (cudaMemcpy (result, d_b, sizeof (result[0]) * len,
                                cudaMemcpyDeviceToHost));

    /* spot check of  results */
    printf ("result[0]=% 23.16e  result[%d]=% 23.16e\n", 
            result[0], len-1, result[len-1]);

    /* free memory on host and device */
    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    free (result);
    free (source);

    return EXIT_SUCCESS;
}
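If you would prefer not to write a kernel by hand, the same element-wise operation can also be expressed with Thrust, which ships with the CUDA toolkit, using thrust::transform and a unary functor. The following is a minimal sketch, not part of the program above; the functor name sigmoid_op and the vector length are illustrative choices:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <math.h>
#include <stdio.h>

/* unary functor applying the sigmoid function; the name is illustrative */
struct sigmoid_op
{
    __host__ __device__ double operator()(double a) const
    {
        return 1.0 / (1.0 + exp (-a));
    }
};

int main (void)
{
    const int len = 100000;

    /* create source data on the host: src[i] = i * 1e-5 */
    thrust::host_vector<double> h_src (len);
    for (int i = 0; i < len; i++) h_src[i] = i * 1e-5;

    /* copy to the device and allocate space for the result */
    thrust::device_vector<double> d_src = h_src;
    thrust::device_vector<double> d_dst (len);

    /* apply sigmoid element-wise on the device */
    thrust::transform (d_src.begin(), d_src.end(), d_dst.begin(), sigmoid_op());

    /* spot check of results (reads copy individual elements back to the host) */
    double first = d_dst[0], last = d_dst[len-1];
    printf ("result[0]=% 23.16e  result[%d]=% 23.16e\n", first, len-1, last);

    return 0;
}

Like the hand-written kernel, this is memory-bound, so the two approaches should perform similarly; Thrust mainly saves you the explicit memory management and launch configuration.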