我遇到了一个我无法理解的内存分配问题。我试图在GPU中分配一个char数组(我猜它可能是一个内存碎片问题)。
这是我的代码,
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<cuda.h>
// Error-reporting helper for CUDA runtime calls.
//   code  - status value to test against cudaSuccess.
//   file  - file name to print in the message.
//   line  - line number to print in the message.
//   abort - when non-zero (the default), the process exits with `code` as its
//           exit status after printing; when 0, the function just returns.
// Nothing is printed and nothing happens when code == cudaSuccess.
// NOTE(review): `file` is declared `char *` but the macro below passes
// __FILE__, which is a string literal; `const char *` would be the matching type.
inline void gpuAssert(cudaError_t code, char *file, int line,
int abort=1)
{
if (code != cudaSuccess) {
printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// Wraps an expression that yields a cudaError_t and forwards it to gpuAssert
// together with the call site's __FILE__ and __LINE__.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Kernel: copies the single char that `k` points to into the location `i`
// points to. Both pointers are dereferenced in device code. No thread or block
// index is used, so every launched thread performs this same one-byte copy.
__global__ void calc(char *k,char *i)
{
*i=*k;
}
// Reproducer from the question: launches calc with one thread and tries to
// copy the result back to the host and print it.
int main()
{
char *dev_o=0;
char *i;
// Host buffer of 10 chars that is meant to receive the result.
i = (char*)malloc(10*sizeof(char));
// Device buffer of 10 chars; the return status of this call is not checked.
cudaMalloc((void**)&dev_o,10*sizeof(char)); //Line 31
// NOTE(review): the first argument is the string literal "arun", i.e. a host
// address, yet calc dereferences it in device code. No error check follows
// the launch, so a failure here would only surface in a later CUDA call.
calc<<<1,1>>>("arun",dev_o);
// NOTE(review): the destination is `&i` (the address of the pointer variable),
// not `i` (the malloc'ed buffer), so the 10 copied bytes would land on the
// pointer variable itself rather than in the buffer.
gpuErrchk(cudaMemcpy(&i,dev_o,10*sizeof(char),cudaMemcpyDeviceToHost));
cudaFree(dev_o);
printf("string : %s \n",i);
// The host buffer allocated with malloc above is never freed.
return 0;
}
但是我得到了输出,
GPUassert: out of memory sample2.cu 31
在同样的情况下,我尝试在GPU中分配整数并且它正常工作。
我的GPU设备信息为,
--- General Information for device 0 ---
Name:GeForce GTX 460 SE
Compute capability:2.1
Clock rate:1296000
Device copy overlap:Enabled
Kernel execition timeout :Enabled
--- Memory Information for device 0 ---
Total global mem:1073283072
Total constant Mem:65536
Max mem pitch:2147483647
Texture Alignment:512
--- MP Information for device 0 ---
Multiprocessor count:6
Shared mem per mp:49152
Registers per mp:32768
Threads in warp:32
Max threads per block:1024
Max thread dimensions:(1024, 1024, 64)
Max grid dimensions:(65535, 65535, 65535)
谁能告诉我这是什么问题以及如何克服它?
答案 0 :(得分:2)
您的代码中出现了一些问题。
cudaMemcpy(&i, ...)
应改为 cudaMemcpy(i, ...)。
请检查内核调用返回的错误,如 this post 中所述。如果不这样做,内核里的错误看起来会像是在代码中更靠后的位置才出现的。
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
内核的 char *k 参数是主机指针。在调用内核之前,您应该另外创建一个设备数组,并把数据复制到设备上。
另外,您的 calc() 内核中没有使用 threadIdx.x,因此各个线程实际上没有做任何并行工作。这可能只是为了测试。如果您解决了这些问题,就会得到如下代码:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<cuda.h>
/**
 * Reports a failed CUDA runtime call.
 *
 * Does nothing when `code` is cudaSuccess. Otherwise prints
 * "GPUassert: <error string> <file> <line>" to stdout and, unless `abort`
 * is 0, terminates the process with `code` as the exit status.
 *
 * @param code  Status returned by a CUDA runtime call.
 * @param file  Source file name to report. Taken as `const char *` because the
 *              gpuErrchk macro passes __FILE__, a string literal; converting a
 *              literal to plain `char *` is ill-formed since C++11.
 * @param line  Source line number to report.
 * @param abort Non-zero (default) to exit on error, 0 to only print.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      int abort=1)
{
    if (code != cudaSuccess) {
        printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Wraps an expression yielding a cudaError_t and forwards it to gpuAssert
// together with the call site's file name and line number.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Kernel: each thread copies one char from `k` to `i` at the offset given by
// its threadIdx.x. Only threadIdx.x is used for indexing (no block index),
// and there is no bounds check, so both buffers must hold at least as many
// chars as there are threads in the block.
__global__ void calc(char* k, char *i)
{
    const unsigned int tid = threadIdx.x;
    const char value = k[tid];
    i[tid] = value;
}
/**
 * Host driver: sends the string "arun" to the GPU, has the calc kernel copy it
 * into a second device buffer with one thread per byte, copies the result
 * back to the host and prints it.
 *
 * Every CUDA runtime call is checked through gpuErrchk, which exits on failure.
 *
 * @return 0 on success, EXIT_FAILURE if a host allocation fails.
 */
int main()
{
    const char* msg = "arun";
    // Include the terminating NUL so the buffer copied back is a valid C string.
    const size_t nbytes = strlen(msg) + 1;

    char* k = (char*)malloc(nbytes);
    char* i = (char*)malloc(nbytes);
    if (k == NULL || i == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(k);
        free(i);
        return EXIT_FAILURE;
    }
    // Plain byte copy: msg is data, so it must never be used as a format string.
    memcpy(k, msg, nbytes);

    char* dev_i = NULL;
    char* dev_k = NULL;
    gpuErrchk(cudaMalloc((void**)&dev_i, nbytes));
    gpuErrchk(cudaMalloc((void**)&dev_k, nbytes));
    gpuErrchk(cudaMemcpy(dev_k, k, nbytes, cudaMemcpyHostToDevice));

    // One thread per byte in a single block; calc indexes by threadIdx.x only,
    // so nbytes must not exceed the device's maximum threads per block.
    calc<<<1, (unsigned int)nbytes>>>(dev_k, dev_i);
    gpuErrchk(cudaPeekAtLastError());
    // Synchronization will be done in the next synchronous cudaMemcpy call, else
    // you would need cudaDeviceSynchronize() to detect execution errors.
    //gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(i, dev_i, nbytes, cudaMemcpyDeviceToHost));

    printf("string : %s\n", i);

    gpuErrchk(cudaFree(dev_i));
    gpuErrchk(cudaFree(dev_k));
    free(i);
    free(k);
    return 0;
}