#include <stdio.h>
#define cols 500
#define rows 50
#define arraySize rows * cols
__global__ void addOnGPU(int *a, int *b, int *c) {
// Only use data at this index
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < arraySize) c[tid] = a[tid] + b[tid];
int main()
FILE *ppm_fp;
int a[arraySize], b[arraySize], c[arraySize];
int *dev_a, *dev_b, *dev_c;
int i, j;
int threadsperblock = 256;
int blocks = (arraySize + threadsperblock - 1) / threadsperblock;
// Allocate memory on GPU for the three vectors
cudaError_t cudaStatus = cudaMalloc((void **) &dev_a, arraySize * sizeof(int));
cudaStatus = cudaMalloc((void **) &dev_b, arraySize * sizeof(int));
cudaStatus = cudaMalloc((void **) &dev_c, arraySize * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Unable to allocate memory on the GPU!");
return 1;
// Assign values to input vectors
for (i = 0, j = 0; i < arraySize; i++, j++) {
a[i] = i;
b[i] = i * i;
// Copy input values to allocated vectors in GPU memory
cudaStatus = cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(dev_b, b, arraySize * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Unable to copy input vectors to the GPU!");
return 1;
// Add vectors in parallel and save results in dev_c
addOnGPU<<<blocks, threadsperblock>>>(dev_a, dev_b, dev_c);
// Copy results from dev_c to local c vector
cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Unable to copy input vectors to the GPU!");
return 1;
ppm_fp = fopen("image.ppm", "wb");
fprintf(ppm_fp, "P6\n%d %d\n255\n", cols, rows);
for (i = 0; i < arraySize; i++) {
if (i % (3 * cols) == 0) fprintf(ppm_fp, "\n");
fprintf(ppm_fp, "%d ", c[i]);
// Display contents of output vector
for (i = 0; i < arraySize; i++) {
printf("%d + %d = %d\n", a[i], b[i], c[i]);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
return 0;
按照目前的情况,程序可以使用cols和rows的值运行。如果我将行增加到500,则程序崩溃。我已经包含了一些调试打印语句,试图找到它崩溃的地方,但是一旦我运行它就会崩溃。我在Visual Studio 2013上运行它(其中我使用的是新手,并且更熟悉VI,linux和手动编译)。我有一个GTX 580 3GB版本,如果这很重要。我知道我无法超越任何内存限制,并且我没有超过可以创建的块的65536(或者是65535)限制,或者每块的512个线程限制。关于出了什么问题的任何想法?
答案 0 :(得分:5)
您观察到的崩溃与CUDA无关,并且是由于C / C ++静态数组分配达到了内存限制
int a[arraySize], b[arraySize], c[arraySize];
动态分配的数组int* a = (int*)malloc(arraySize*sizeof(int));
What and where are the stack and heap?
What is the canonical way to check for errors using the CUDA runtime API?
现在也在CUDA Tag Wiki中提到了这一点。它可能会帮助你自己排除CUDA错误。