我正在尝试分配设备内存,复制到它,在GPU上执行计算,将结果复制回来然后释放我分配的设备内存。我想确保我没有超出限制,我想看看我是否在共享内存空间中有足够的内存来转储一些数组。
当我分配设备内存时,不会返回任何错误。当我使用cudaMemGetInfo
检查已分配的内存量时,看起来其中一个cudaMalloc
没有分配任何内存。
此外,当我尝试释放内存时,看起来只有一个指针被释放。
我正在使用matlab Mexfunction
接口来设置GPU内存并启动内核。此时,我甚至没有调用内核,只返回结果的单位矩阵。
/* Status of the most recent CUDA runtime call. */
cudaError_t cudaErr;
/* Byte counts reported by cudaMemGetInfo: freeMem is the baseline taken
 * before any allocation, allocMem is refreshed after every later step. */
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaErr = cudaMemGetInfo(&freeMem, &totalMem);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not query device memory\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
/* size_t is 64 bits wide on 64-bit Windows while unsigned long is only 32,
 * so "%lu" does not match; print through unsigned long long instead. */
mexPrintf("Memory avaliable: Free: %llu, Total: %llu\n",
          (unsigned long long)freeMem, (unsigned long long)totalMem);
/* Pointers for the device memory. Start them at 0 so that a failed
 * allocation never leaves an indeterminate pointer to be freed later. */
double *devicePulseDelay = 0, *deviceTarDistance = 0, *deviceScattDistance = 0, *deviceScatterers = 0;
double *deviceReceivedReal = 0, *deviceReceivedImag = 0;
/* Allocate memory on the device for the arrays.
 *
 * After each cudaMalloc the free/total byte counts are queried again and the
 * difference from the baseline (freeMem) is printed as "Consumed". When an
 * allocation fails the error is reported and the pointer is reset to 0, so
 * the later cudaFree on it is a no-op instead of acting on an indeterminate
 * value. Sizes are printed through unsigned long long because "%lu" does not
 * match size_t on 64-bit Windows. */
mexPrintf("Allocating memory.\n");
cudaErr = cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
    devicePulseDelay = 0;
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
    deviceTarDistance = 0;
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
    deviceScattDistance = 0;
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
    deviceScatterers = 0;
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
    deviceReceivedReal = 0;
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
    deviceReceivedImag = 0;
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
/* Copy the input arrays across to the device.
 *
 * Each blocking cudaMemcpy is followed by a memory report. Sizes are printed
 * through unsigned long long because "%lu" does not match size_t on 64-bit
 * Windows. */
mexPrintf("\nCopying memory.\n");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
/* call the kernel */
// launchKernel<<<1,512>>>(........);
/* Retrieve the output.
 *
 * NOTE(review): the kernel launch above is commented out, so the device
 * result buffers have not been written when they are copied back.
 * NOTE(review): the copies below move 512*512 doubles, whereas the device
 * result buffers were allocated with 999*512 doubles - confirm which extent
 * the host arrays receivedReal/receivedImag actually have.
 * Sizes are printed through unsigned long long because "%lu" does not match
 * size_t on 64-bit Windows. */
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to receivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedReal: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to receivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedImag: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
/* Free the memory.
 *
 * freeMem is re-sampled first so that it becomes the baseline for this
 * phase; after every cudaFree the growth in free memory relative to that
 * baseline is printed as "Free'd". Failure messages say "could not free"
 * (the earlier wording "could free" stated the opposite of what happened).
 * Sizes are printed through unsigned long long because "%lu" does not match
 * size_t on 64-bit Windows. */
mexPrintf("\nFree'ing memory.\n");
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Before freeing: Free %llu, Total: %llu\n",
          (unsigned long long)freeMem, (unsigned long long)totalMem);
cudaErr = cudaFree(devicePulseDelay);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceTarDistance);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceScattDistance);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceScatterers);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedReal);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceReceivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedImag);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceReceivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
以下是此输出:
Memory avaliable: Free: 2523959296, Total: 2818572288
Allocating memory.
devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880
deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456
deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608
deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Copying memory.
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Free'ing memory.
Before freeing: Free 2513473536, Total: 2818572288
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
我觉得我漏掉了某个显而易见的东西。有人能帮忙解释一下这是怎么回事吗?
编辑:平台是配有Tesla C2050 GPU卡的Windows 7。
答案 0 :(得分:12)
一种常见的误解是malloc
在被调用时直接从主机操作系统获取内存分配,free
在调用时直接将它们释放回主机操作系统。但它们几乎从不这样工作:标准库会维护一个由已释放和已分配内存块组成的循环链表,并通过与主机操作系统的交互适时地扩展和收缩它(如果您有兴趣了解更多细节,请参阅“How do malloc() and free() work?”这一问题下的一些回答)。无论其具体工作方式如何,这都会导致许多不直观的结果,包括:通常无法分配到操作系统报告为空闲的全部内存;分配有时似乎不会改变可用内存的数量;以及free
有时对操作系统报告的空闲内存量没有任何影响。
虽然我只有经验性的证据支持这一点,但我相信CUDA的工作方式完全相同。上下文维护着自己的已分配和已释放内存列表,并在主机驱动程序/窗口管理器以及GPU本身允许的范围内扩展和收缩该列表所持有的内存。所有硬件都具有特定的MMU页面大小,并且有证据表明NVIDIA GPU上的页面大小相当大。这意味着cudaMalloc
调用中的粒度相当粗糙,并且有时意味着malloc
似乎不会影响可用内存量或消耗比请求的内存更多的内存,有时free
调用似乎没有任何效果(如果您感兴趣,有一个有助于说明CUDA驱动程序页面大小行为的小工具,不过它是针对早期版本的CUDA API编写的,可能需要做一些修改才能用现代版本编译)。我相信这是对您所观察到的行为最可能的解释。
顺便提一下,如果我在MacOS 10.6上使用GT200系列设备运行您所发布代码的简化版本:
#include <cstdio>
#include <cstdlib>
#include <cstring>
#define mexPrintf printf
/**
 * Report a failed CUDA runtime call and optionally terminate the process.
 *
 * Does nothing when `code` is cudaSuccess. Otherwise prints the runtime's
 * error string together with the source location through mexPrintf and,
 * when `abort` is true, exits with `code` as the process status.
 *
 * @param code  Status returned by a CUDA runtime call.
 * @param file  Source file of the call site. Taken as const char* so that
 *              the __FILE__ string literal can be passed without a
 *              deprecated literal-to-char* conversion.
 * @param line  Source line of the call site.
 * @param abort Exit the process on failure (default true).
 */
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort=true)
{
    if (code != cudaSuccess)
    {
        mexPrintf("GPUassert: %s %s %d\n", cudaGetErrorString(code),
                  file, line);
        if (abort) exit(code);
    }
}
/* Wrap a CUDA runtime call: forwards its status to gpuAssert together with
 * the call site's file and line. The do/while(0) form makes the expansion a
 * single statement, so `gpuErrchk(x);` is safe inside an unbraced if/else. */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
/**
 * Query the device memory counters and print a one-line report.
 *
 * cudaMemGetInfo stores the current free and total byte counts in *avail
 * and *total (checked through gpuErrchk). Without a reference value only
 * those two numbers are printed. With one, the line is prefixed by `title`
 * (at most 31 characters are used) and ends with the difference between the
 * reference and the current free amount.
 *
 * @param avail Receives the currently free byte count.
 * @param total Receives the total byte count.
 * @param title Optional label printed in front of the report.
 * @param free  Optional reference free-byte count; selects the long form.
 * @param sense true: print "Allocated" as *free - *avail;
 *              false: print "Freed" as *avail - *free.
 *              The subtraction is unsigned and is not guarded against
 *              wrap-around.
 */
inline void gpuMemReport(size_t * avail, size_t * total,
                         const char * title = 0, const size_t * free = 0, const bool sense = true)
{
    gpuErrchk( cudaMemGetInfo(avail, total) );

    /* Short form: no reference value was supplied. */
    if (!free) {
        mexPrintf("Memory avaliable: Free: %zu, Total: %zu\n", *avail, *total);
        return;
    }

    /* Bounded copy of the label; the final byte stays '\0'. */
    char label[32] = { '\0' };
    if (title) {
        strncpy(label, title, 31);
    }

    const char * verb = (sense) ? "Allocated\0" : "Freed\0";
    const size_t delta = (sense) ? (*free - *avail) : (*avail - *free);

    mexPrintf("%s Memory avaliable: Free: %zu, Total: %zu, %s: %zu\n",
              label, *avail, *total, verb, delta);
}
/**
 * Allocate six device buffers, then release them in the same order,
 * printing a memory report after every step.
 *
 * The buffers are described by two parallel tables (report label and byte
 * count). Each loop issues the same runtime calls, in the same order, as a
 * hand-unrolled sequence would.
 */
int main()
{
    /* Report labels, in allocation order. */
    const char * const labels[] = {
        "devicePulseDelay:",
        "deviceTarDistance:",
        "deviceScattDistance:",
        "deviceScatterers:",
        "deviceReceivedReal:",
        "deviceReceivedImag:"
    };
    /* Requested size in bytes for each buffer above. */
    const size_t byteCounts[] = {
        sizeof(double)*512,
        sizeof(double)*512,
        sizeof(double)*999*512,
        sizeof(double)*999,
        sizeof(double)*999*512,
        sizeof(double)*999*512
    };
    const int bufferCount = sizeof(labels) / sizeof(labels[0]);
    double *buffers[sizeof(labels) / sizeof(labels[0])];

    size_t baseline = 0;  /* free bytes at the start of the current phase */
    size_t capacity = 0;  /* total bytes reported by the runtime */
    size_t current = 0;   /* free bytes after the most recent step */

    /* cudaFree(0) comes first, before the baseline report is taken. */
    gpuErrchk( cudaFree(0) );
    gpuMemReport(&baseline, &capacity);

    mexPrintf("Allocating memory.\n");
    for (int i = 0; i < bufferCount; ++i) {
        gpuErrchk( cudaMalloc( (void **) &buffers[i], byteCounts[i]) );
        gpuMemReport(&current, &capacity, labels[i], &baseline);
    }

    mexPrintf("\nFree'ing memory.\n");
    /* Re-sample the baseline so the "Freed" figures count from here. */
    gpuMemReport(&baseline, &capacity);
    for (int i = 0; i < bufferCount; ++i) {
        gpuErrchk( cudaFree(buffers[i]) );
        gpuMemReport(&current, &capacity, labels[i], &baseline, false);
    }

    return 0;
}
我得到了不同的结果,但它同样表现出了相同的现象:
Allocating memory.
devicePulseDelay: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceTarDistance: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceScattDistance: Memory avaliable: Free: 198778880, Total: 265027584, Allocated: 5140480
deviceScatterers: Memory avaliable: Free: 197730304, Total: 265027584, Allocated: 6189056
deviceReceivedReal: Memory avaliable: Free: 193638400, Total: 265027584, Allocated: 10280960
deviceReceivedImag: Memory avaliable: Free: 189546496, Total: 265027584, Allocated: 14372864
Free'ing memory.
Memory avaliable: Free: 189546496, Total: 265027584
devicePulseDelay: Memory avaliable: Free: 189546496, Total: 265027584, Freed: 0
deviceTarDistance: Memory avaliable: Free: 190595072, Total: 265027584, Freed: 1048576
deviceScattDistance: Memory avaliable: Free: 194686976, Total: 265027584, Freed: 5140480
deviceScatterers: Memory avaliable: Free: 195735552, Total: 265027584, Freed: 6189056
deviceReceivedReal: Memory avaliable: Free: 199827456, Total: 265027584, Freed: 10280960
deviceReceivedImag: Memory avaliable: Free: 203919360, Total: 265027584, Freed: 14372864
这表明该行为也依赖于硬件/主机操作系统。