我正在cuda中编写一个程序,将其带到向量上并使用它们进行mse mse = [sum((v1-v2)^ 2)]。 mse1函数确实具有ves [i] =(v1 [i] -v2 [i])^ 2,而我是靠自己完成的。 reduce6是我在互联网上发现的将向量的每个值添加为树的函数,该函数不起作用。 我是cuda的新手,我只是复制和修改一些发现的代码。有两个地方没有编译,我也不知道为什么。请有人告诉我为什么不编译以及如何解决它。 顺便说一句,我将使用矢量大小为10000的该程序,您向我推荐多少块和线程?
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#define N 10000 //hay que cambiarlo
#define THREADS_PER_BLOCK 512 //hay que cambiarlo
/*NUEVO PA PROBAR
__global__ void add_arrays_gpu(int* a, int *b, int* c)
{
c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}
//---------NUEVO FIN PA PROBAR*/
//__global__ void addKernel(int *c, const int *a, const int *b)//voy a quitar los const
//*
//SE LO COME
__global__ void mse1( float *vec1, float *vec2, float *res, int n)
{
float sol = 0.0f;
int index = threadIdx.x + blockIdx.x* blockDim.x;
if (index < n)
res[threadIdx.x] = __powf(vec1[threadIdx.x] - vec2[threadIdx.x], 2);
__syncthreads();//no sé si es necesario
//*/
}//*/
//* SE LO COME //int->float
template <unsigned int blockSize>__device__ void warpReduce(volatile float *sdata, unsigned int tid) {
//*
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
///*/
}
//*/
// voy a cambiar int->float
// AQUI FALLA
template <unsigned int blockSize> __global__ void reduce6(float *g_idata, float *g_odata, unsigned int n)
{//g_idata is our vector//ES DENTRO DONDE FALLA
//*
extern __shared__ float sdata[];// INT->FLOAT
unsigned int tid = threadIdx.x;//AHORA TAMPOCO
unsigned int i = blockIdx.x*(blockSize * 2) + tid;
unsigned int gridSize = blockSize * 2 * gridDim.x;
sdata[tid] = 0;
while (i < n) { sdata[tid] += g_idata[i] + g_idata[i + blockSize]; i += gridSize; }
__syncthreads();
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
if (tid < 32) warpReduce(sdata, tid);// warpReduce(sdata, tid) is not working
if (tid == 0) g_odata[blockIdx.x] = sdata[0];//result in g_odata[blockIdx.x]
}
//*/
int main()
{
float *a, *b, *c, *d;// host copies of a, b, c
float *d_a, *d_b, *d_c, *d_d;// device copies of a, b, c
int size = sizeof(float)*N;
//* Allocate space for device copies of a, b, c, d//se lo come
cudaMallocManaged(&d_a, size);//
cudaMallocManaged(&d_b, size);///
cudaMallocManaged(&d_c, size);///
cudaMallocManaged(&d_d, size);//*/
//los originales
/* 2653
cudaMalloc((void**)&d_a, size);
cudaMalloc((void**)&d_b, size);
cudaMalloc((void**)&d_c, size);
cudaMalloc((void**)&d_d, size);
//*/
//* Allocspace for host copies of a, b, c//se lo come
a = (float*)malloc(size);
b = (float*)malloc(size);
c = (float*)malloc(size);
d = (float*)malloc(size);//*/
// Copy inputs to device//se lo come
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);// Launch add() kernel on GPU
//SE LO COME
mse1 <<< THREADS_PER_BLOCK, THREADS_PER_BLOCK >>> (d_a, d_b, d_c, N);// Copy result back to host//N number of elements in a vector 10000
cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);// Cleanup//no se si esto esta bien
cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);
int dimGrid = 10, dimBlock = 12, smemSize = 78, threads=4;
//This line reduce6 doesn't work
//reduce6 <<< THREADS_PER_BLOCK, THREADS_PER_BLOCK >>> (d_c, d_d, N);//el numero de vectores no sé cual poner
//I add this and now this lines works correctly
switch (threads)
{
case 512:
reduce6<512> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
case 256:
reduce6<256> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
case 128:
reduce6<128> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
case 64:
reduce6< 64> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
case 32:
reduce6< 32> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
case 16:
reduce6< 16> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
case 8:
reduce6< 8> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
case 4:
reduce6< 4> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
case 2:
reduce6< 2> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
case 1:
reduce6< 1> <<< dimGrid, dimBlock, smemSize >>> (d_c, d_d, N); break;
}
cudaMemcpy(&d, d_d, size, cudaMemcpyDeviceToHost);// Cleanup
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cudaFree(d_d);
return 0;
}
编译错误为
Severity Code Description Project File Line Suppression State
Error (active) E1097 unknown attribute "__device__" my_mse C:\Users\COM\Documents\Visual Studio 2017\Projects\my_mse\my_mse\kernel.cu 29
Severity Code Description Project File Line Suppression State
Error (active) E0029 expected an expression my_mse C:\Users\COM\Documents\Visual Studio 2017\Projects\my_mse\my_mse\kernel.cu 103
Severity Code Description Project File Line Suppression State
Error no instance of function template "warpReduce" matches the argument list my_mse C:\Users\COM\Documents\Visual Studio 2017\Projects\my_mse\my_mse\kernel.cu 56
Severity Code Description Project File Line Suppression State
Error MSB3721 The command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\bin\nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64" -x cu -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -g -DWIN32 -DWIN64 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc141.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\kernel.cu.obj "C:\Users\COM\Documents\Visual Studio 2017\Projects\my_mse\my_mse\kernel.cu"" exited with code 1. my_mse C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\Common7\IDE\VC\VCTargets\BuildCustomizations\CUDA 10.1.targets 764