我正在尝试通过编写一个非常简单的CUDA程序来测试一个功能。该程序只添加两个向量并显示结果。它有CPU和GPU解算器,它们应该显示相同的结果。程序将输入作为要生成的数组的大小(如此./test [numelements])。
我的程序问题是它适用于3个或更少的元素。除此之外,错误如下所示:
*** Error in `./test': free(): invalid next size (fast): 0x0000000001927070 ***
======= Backtrace: =========
/lib/x86_64-linux-gnu/libc.so.6(+0x80a46)[0x7fd382a31a46]
/usr/lib/nvidia-current/libcuda.so(+0x770a1c)[0x7fd3821b8a1c]
/usr/lib/nvidia-current/libcuda.so(+0x770b6f)[0x7fd3821b8b6f]
/usr/lib/nvidia-current/libcuda.so(+0x268b7e)[0x7fd381cb0b7e]
/usr/lib/nvidia-current/libcuda.so(+0x1b1a10)[0x7fd381bf9a10]
/usr/lib/nvidia-current/libcuda.so(+0xe7efd)[0x7fd381b2fefd]
/usr/lib/nvidia-current/libcuda.so(cuInit+0x43)[0x7fd381b087d3]
./test[0x4214bc]
./test[0x422e31]
./test[0x4439e2]
./test[0x402a27]
./test[0x402955]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf5)[0x7fd3829d2ea5]
./test[0x402719]
======= Memory map: ========
这是我的整个申请:
主文件:test.cpp:
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include "util.h"
#include "utilCUDA.h"
using namespace std;
int main(int argc, char** argv)
{
int size = atoi(argv[1]);
double *a, *b, *c, *cBase;
int j;
a = (double*)malloc(sizeof(double));
b = (double*)malloc(sizeof(double));
c = (double*)malloc(sizeof(double));
cBase = (double*)malloc(sizeof(double));
srand(time(NULL));
for(j = 0; j < size; j++)
{
a[j] = rand() % 10;
b[j] = rand() % 10;
}
printVec(a,size);
printVec(b,size);
add(a,b,cBase,size);
printVec(cBase,size);
addCUDA(a,b,c,size);
printVec(c,size);
free(a);
free(b);
free(c);
free(cBase);
return 0;
}
util.h及其对应的util.cpp
#ifndef __UTIL_H__
#define __UTIL_H__
#include <stdio.h>
void add(double *a, double *b, double *c, int size);
void printVec(double *v, int size);
#endif
util.cpp:
#include "util.h"
void add(double *a, double *b, double * c, int N)
{
int tid = 0;
while (tid < N)
{
c[tid] = a[tid] + b[tid];
tid += 1;
}
}
void printVec(double *v, int size)
{
int i;
for(i = 0; i < size; i++)
printf("%f ", v[i]);
printf("\n");
}
utilCUDA.h:
#ifndef __UTILCUDA_H__
#define __UTILCUDA_H__
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
__global__ void myAdd(double *a, double *b, double *c, int size);
void addCUDA (double *a, double *b, double *c, int size);
#endif
utilCUDA.cu:
#include <stdio.h>
#include <stdlib.h>
#include "utilCUDA.h"
#define THREAD_PER_BLOCK 128
__global__ void myAdd( double *a, double *b, double *c, int size ) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;//blockIdx.x; // this thread handles the data at its thread id
if (tid < size)
c[tid] = a[tid] + b[tid];
}
void addCUDA(double *a, double *b, double *c, int size)
{
double *dev_a, *dev_b, *dev_c;
cudaMalloc( (void**)&dev_a, size * sizeof(double) );
cudaMalloc( (void**)&dev_b, size * sizeof(double) );
cudaMalloc( (void**)&dev_c, size * sizeof(double) );
cudaMemcpy( dev_a, a, size * sizeof(double),
cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, size * sizeof(double),
cudaMemcpyHostToDevice );
myAdd<<<(size - 1)/THREAD_PER_BLOCK + 1,THREAD_PER_BLOCK>>>( dev_a, dev_b, dev_c,size );
cudaMemcpy( c, dev_c, size * sizeof(double),
cudaMemcpyDeviceToHost );
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_c );
}
最后是Makefile:
CC = g++
CUDACC=nvcc
CFLAGS=-c -Wall
CUDAFLAGS=-c
SRC=test.cpp utilCUDA.cu
OBJ=test.o utilCUDA.o util.o
CUDA_INCLUDE = /usr/local/cuda-5.5/include
all: test
test: $(OBJ)
$(CUDACC) $(OBJ) -o test
test.o: test.cpp
$(CC) $(CFLAGS) test.cpp -I $(CUDA_INCLUDE)
utilCUDA.o: utilCUDA.cu utilCUDA.h
$(CUDACC) $(CUDAFLAGS) utilCUDA.cu
util.o: util.cpp util.h
$(CC) $(CFLAGS) util.cpp
clean:
rm -rf *.o test
答案 0 :(得分:3)
错误消息表明主机端内存分配存在问题。特别是,发布的代码仅为double
,a
,b
和c
分配了一个cBase
元素,尽管用法表示{的分配{1}}元素是有意的。尝试更改代码,如下所示:
size