我正在尝试使用共享内存在cuda c中做一个关于dot产品的简单教程;代码非常简单,它基本上在两个数组的元素之间进行产品,然后对每个块的结果求和:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#define imin(a,b) (a<b?a:b)
const int N = 33*1024;
const int threadsPerBlock = 256;
const int blocksPerGrid = imin(32 , (N+threadsPerBlock-1)/threadsPerBlock);
__global__ void dot(float *a, float *b, float *c){
__shared__ float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int cacheIndex = threadIdx.x;
float temp = 0;
while (tid < N){
temp += a[tid]*b[tid];
tid += blockDim.x*gridDim.x; /* Aggiorno l'indice per l'evenutale overshoot. */
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i != 0){ /
if(cacheIndex < i){
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
}
if(cacheIndex == 0){
c[blockIdx.x] = cache[0];
}
}
int main(void){
cudaError_t err = cudaSuccess;
float a[N], b[N], c[blocksPerGrid];
float *d_a, *d_b, *d_c;
int i;
for(i=0;i<N;i++){
a[i] = i;
b[i] = i*2;
}
for(i=0; i<blocksPerGrid;i++){
c[i] = 0;
}
err = cudaMalloc((void**)&d_a, N*sizeof(float));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector a (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMalloc((void**)&d_b, N*sizeof(float));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector b (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMalloc((void**)&d_c, blocksPerGrid*sizeof(float));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector c (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
/* Copio i valori dei vettori a e b nello spazio di memoria allocato precedentemente nel device. */
err = cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector a from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector b from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMemcpy(d_c, c, blocksPerGrid*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector c from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
dot<<<blocksPerGrid,threadsPerBlock>>>(d_a, d_b, d_c); err = cudaGetLastError();
err = cudaMemcpy(c, d_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector c from device to host (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_a);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector a (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_b);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector b (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_c);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector c (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
float result = 0;
for(i=0;i<blocksPerGrid;i++){
result += c[i];
}
printf("il risultato finale è: %.2f\n", result);
return 0;
}
这段代码与Cuda by Example书中的代码相同,唯一的区别在于向量a,b和c的定义(我定义它们的方式不应该是问题,因为我&# 39;已经多次完成了。)
问题在于:当我尝试运行该程序时,它会崩溃!终端说问题是:Failed to copy vector c from device to host (error code the launch timed out and was terminated)!
因为我认为我已经以适当的方式分配了矢量c,这种接缝很奇怪...有没有人知道我做错了什么?它是全局函数还是主要的错误?
答案 0 :(得分:2)
你的内核中有一个无限循环。你得到错误的原因是因为你所在的平台上有看门狗超时,看门狗正在杀死内核执行。
考虑这段代码:
int i = blockDim.x/2;
while(i != 0){
if(cacheIndex < i){
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
}
如果i
小于cacheIndex
,则只将循环索引(i
)除以2。对于其他线程,一旦该线程退出if语句,i
将始终保持相同的值。对于那些线程,永远不会退出while循环(i
永远不会等于零)。您想要为所有线程划分i
变量。像这样:
int i = blockDim.x/2;
while(i != 0){
if(cacheIndex < i){
cache[cacheIndex] += cache[cacheIndex + i];
}
__syncthreads();
i /= 2;
}
请注意,我已将__syncthreads()
移出if语句。这可能不是解决您的问题所必需的,但在技术上不正确,因为我们通常希望所有线程都参与__syncthreads()
语句。如果条件在所有线程中评估相同,则仅允许在条件代码中使用 - 这是documented in the programming guide。
如果你将这方面的代码与cuda by examples source code for dot.cu in chapter 5中的代码进行比较,我认为你会发现它们不相同。