cuda c ++调用内核不起作用,为什么?

时间:2019-12-16 03:13:52

标签: c++ cuda

我正在cuda中编写一个程序,将其带到向量上并使用它们进行mse mse = [sum((v1-v2)^ 2)]。 mse1函数确实具有ves [i] =(v1 [i] -v2 [i])^ 2,而我是靠自己完成的。 reduce6是我在互联网上发现的将向量的每个值添加为树的函数,该函数不起作用。 我是cuda的新手,我只是复制和修改一些发现的代码。有两个地方没有编译,我也不知道为什么。请有人告诉我为什么不编译以及如何解决它。 顺便说一句,我将使用矢量大小为10000的该程序,您向我推荐多少块和线程?

#ifndef __CUDACC__
#define __CUDACC__
#endif
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

#include <stdio.h>

#define N 10000 //hay que cambiarlo
#define THREADS_PER_BLOCK 512 //hay que cambiarlo
/*NUEVO PA PROBAR
__global__ void add_arrays_gpu(int* a, int *b, int* c)
{
    c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}
//---------NUEVO FIN PA PROBAR*/
//__global__ void addKernel(int *c, const int *a, const int *b)//voy a quitar los const
//*

//SE LO COME
__global__ void mse1( float *vec1,  float *vec2, float *res, int n)
{

    float sol = 0.0f;
    int index = threadIdx.x + blockIdx.x* blockDim.x;
    if (index < n)
        res[threadIdx.x] = __powf(vec1[threadIdx.x] - vec2[threadIdx.x], 2);

    __syncthreads();//no sé si es necesario
    //*/
}//*/
//*  SE LO COME //int->float 
template <unsigned int blockSize>__device__ void warpReduce(volatile float *sdata, unsigned int tid) {
    //*
    if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
    if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
    if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
    if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
    if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
    if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
    ///*/
}
//*/
// voy a cambiar int->float
// AQUI FALLA
template <unsigned int blockSize> __global__ void reduce6(float *g_idata, float *g_odata, unsigned int n) 
{//g_idata is our vector//ES DENTRO DONDE FALLA
    //*
    extern __shared__  float sdata[];// INT->FLOAT 
    unsigned int tid = threadIdx.x;//AHORA TAMPOCO
    unsigned int i = blockIdx.x*(blockSize * 2) + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    while (i < n) { sdata[tid] += g_idata[i] + g_idata[i + blockSize];  i += gridSize; }
    __syncthreads();
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }

    if (tid < 32) warpReduce(sdata, tid);// warpReduce(sdata, tid) is not working
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];//result in g_odata[blockIdx.x]

}
//*/
int main()
{
    float *a, *b, *c, *d;// host copies of a, b, c
    float *d_a, *d_b, *d_c, *d_d;// device copies of a, b, c
    int size = sizeof(float)*N;

    //* Allocate space for device copies of a, b, c, d//se lo come
    cudaMallocManaged(&d_a, size);// 
    cudaMallocManaged(&d_b, size);/// 
    cudaMallocManaged(&d_c, size);/// 
    cudaMallocManaged(&d_d, size);//*/ 
    //los originales
    /* 2653
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_c, size);
    cudaMalloc((void**)&d_d, size);
    //*/
    //* Allocspace for host copies of a, b, c//se lo come
    a = (float*)malloc(size);
    b = (float*)malloc(size);
    c = (float*)malloc(size);
    d = (float*)malloc(size);//*/

    // Copy inputs to device//se lo come
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);// Launch add() kernel on GPU

    //SE LO COME
    mse1 <<< THREADS_PER_BLOCK, THREADS_PER_BLOCK >>> (d_a, d_b, d_c, N);// Copy result back to host//N number of elements in a vector 10000

    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);// Cleanup//no se si esto esta bien
    cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);

int dimGrid = 10, dimBlock = 12, smemSize = 78, threads=4;

        //This line reduce6 doesn't work
    //reduce6 <<< THREADS_PER_BLOCK, THREADS_PER_BLOCK >>> (d_c, d_d, N);//el numero de vectores no sé cual poner
//I add this and now this lines works correctly
    switch (threads)
    {
    case 512:
        reduce6<512> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    case 256:
        reduce6<256> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    case 128:
        reduce6<128> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    case 64:
        reduce6< 64> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    case 32:
        reduce6< 32> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    case 16:
        reduce6< 16> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    case 8:
        reduce6< 8> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    case 4:
        reduce6< 4> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    case 2:
        reduce6< 2> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    case 1:
        reduce6< 1> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
    }

    cudaMemcpy(&d, d_d, size, cudaMemcpyDeviceToHost);// Cleanup

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_d);

    return 0;
}


编译错误为

Severity    Code    Description Project File    Line    Suppression State
Error (active)  E1097   unknown attribute "__device__"  my_mse  C:\Users\COM\Documents\Visual Studio 2017\Projects\my_mse\my_mse\kernel.cu  29  
Severity    Code    Description Project File    Line    Suppression State
Error (active)  E0029   expected an expression  my_mse  C:\Users\COM\Documents\Visual Studio 2017\Projects\my_mse\my_mse\kernel.cu  103 
Severity    Code    Description Project File    Line    Suppression State
Error       no instance of function template "warpReduce" matches the argument list my_mse  C:\Users\COM\Documents\Visual Studio 2017\Projects\my_mse\my_mse\kernel.cu  56  
Severity    Code    Description Project File    Line    Suppression State
Error   MSB3721 The command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\bin\nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64" -x cu  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 --compile -cudart static  -g   -DWIN32 -DWIN64 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc141.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\kernel.cu.obj "C:\Users\COM\Documents\Visual Studio 2017\Projects\my_mse\my_mse\kernel.cu"" exited with code 1. my_mse  C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\Common7\IDE\VC\VCTargets\BuildCustomizations\CUDA 10.1.targets 764 

0 个答案:

没有答案