为什么CudaFree似乎无法释放内存？

Question

为什么CudaFree似乎无法释放内存？

10

我正在尝试分配设备内存，将数据拷贝到其中，在GPU上执行计算，将结果拷贝回来，然后释放我分配的设备内存。我想确认是否超过限制，并查看共享内存空间是否足够倾销一些数组。

当我分配设备内存时，没有返回错误。当我使用cudaMemGetInfo检查分配的内存量时，似乎一个cudaMalloc没有分配任何内存。而且，当我尝试释放内存时，似乎只有一个指针被释放了。

我正在使用Matlab的Mexfunction接口设置GPU内存并启动内核。此时，我甚至没有调用内核，只是返回一个单位矩阵作为结果。

cudaError_t cudaErr;
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaMemGetInfo(&freeMem, &totalMem);  
mexPrintf("Memory avaliable: Free: %lu, Total: %lu\n",freeMem, totalMem);  

/* Pointers for the device memory */
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
double *deviceReceivedReal, *deviceReceivedImag;

/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");
cudaErr = cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{   
    mexPrintf("could not allocate memory to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}  
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedImag\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n", allocMem, totalMem,(freeMem - allocMem));

/* copy the input arrays across to the device */
mexPrintf("\nCopying memory.\n");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);   
if (cudaErr != cudaSuccess)
{  
    mexPrintf("could not copy to deviceScattDistance\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice); 
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));  

/* call the kernel */
// launchKernel<<<1,512>>>(........);   

/* retireve the output */  
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);   
if (cudaErr != cudaSuccess)
{   
    mexPrintf("could not copy to receivedReal\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost); 
if (cudaErr != cudaSuccess)
{ 
    mexPrintf("could not copy to receivedImag\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));   

/* free the memory. */ 
mexPrintf("\nFree'ing memory.\n");   
cudaMemGetInfo(&freeMem, &totalMem);  
mexPrintf("Before freeing: Free %lu, Total: %lu\n", freeMem, totalMem);  
cudaErr = cudaFree(devicePulseDelay); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free devicePulseDelay\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));   
cudaErr = cudaFree(deviceTarDistance);   
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could free deviceTarDistance\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceScattDistance);   
if (cudaErr != cudaSuccess) 
{   
    mexPrintf("could free deviceScattDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}   
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceScatterers);  
if (cudaErr != cudaSuccess) 
{   
    mexPrintf("could free deviceScatterers\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceReceivedReal);  
if (cudaErr != cudaSuccess) 
{  
    mexPrintf("could free deviceReceivedReal\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));   
cudaErr = cudaFree(deviceReceivedImag);   
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free deviceReceivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));

这是输出结果：

 
内存可用：Free: 2523959296, 总共：2818572288 
正在分配内存。 
devicePulseDelay：内存可用：Free: 2522910720, 总共：2818572288，已用：1048576 
deviceTarDistance：内存可用：Free: 2522910720, 总共：2818572288，已用：1048576 
deviceScattDistance：内存可用：Free: 2518716416, 总共：2818572288，已用：5242880 
deviceScatterers：内存可用：Free: 2517667840, 总共：2818572288，已用：6291456 
deviceReceivedReal：内存可用：Free: 2515570688, 总共：2818572288，已用：8388608 
deviceReceivedImag：内存可用：Free: 2513473536, 总共：2818572288，已用：10485760
正在复制内存。 
devicePulseDelay：内存可用：Free: 2513473536, 总共：2818572288，已用：10485760 
deviceTarDistance：内存可用：Free: 2513473536, 总共：2818572288，已用：10485760 
deviceScattDistance：内存可用：Free: 2513473536, 总共：2818572288，已用：10485760 
deviceScatterers：内存可用：Free: 2513473536, 总共：2818572288，已用：10485760 
receivedReal：内存可用：Free: 2513473536, 总共：2818572288，已用：10485760 
receivedImag：内存可用：Free: 2513473536, 总共：2818572288，已用：10485760
释放内存。 
释放前：Free 2513473536，总共：2818572288 
devicePulseDelay：内存可用：Free: 2513473536, 总共：2818572288，已释放：0 
deviceTarDistance：内存可用：Free: 2513473536, 总共：2818572288，已释放：0 
deviceScattDistance：内存可用：Free: 2513473536, 总共：2818572288，已释放：0 
deviceScatterers：内存可用：Free: 2514522112, 总共：2818572288，已释放：1048576 
deviceReceivedReal：内存可用：Free: 2514522112, 总共：2818572288，已释放：1048576 
deviceReceivedImag：内存可用：Free: 2514522112, 总共：2818572288，已释放：1048576

感觉有什么我忽略的地方，请有人可以帮忙解释一下吗？

编辑：平台是Windows 7，搭载了Tesla C2050 GPU显卡。

- Beau Bellamy

你在哪个平台上运行这段代码？ - talonmies

在每次调用cudaMemGetInfo()之前，尝试将allocMem和totalMem的值清零，并检查cudaMemGetInfo()的返回值。 - Roger Dahl

在每次调用cudaMemGetInfo（）之前将allocMem和totalMem归零并没有任何区别。此外，cudaMemGetInfo调用未返回任何错误。顺便说一下，我的平台是Tesla C2050 GPU卡上的Windows 7。 - Beau Bellamy

1个回答

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- talonmies · Accepted Answer

常见的误解是malloc在调用时会直接从主机操作系统获取内存分配，而free在调用时会将它们直接释放回主机操作系统。但它们几乎总是不是这样工作的，标准库维护了一个被释放和分配的内存的循环列表，通过与主机操作系统交互来机会性地扩展和收缩它（如果您有兴趣，可以查看一些有关此问题的答案How do malloc() and free() work?）。无论其如何工作，这导致了许多非直观的结果，包括通常无法分配与操作系统报告的可用内存数量相同的内存，分配有时似乎不会改变可用内存量，而free有时似乎对操作系统报告的可用内存量没有影响。

虽然我只有经验证据支持这一点，但我相信CUDA的工作方式完全相同。上下文维护其自己的malloc'd和free'd内存列表，并将该列表中保存的内存随着主机驱动程序/窗口管理器和GPU本身的允许而扩展和收缩。所有硬件都具有特征的MMU页大小，有证据表明NVIDIA GPU上的页面大小相当大。这意味着cudaMalloc调用的粒度相当粗糙，有时malloc似乎不会影响可用内存量或消耗比请求的内存更多的内存，而有时free调用似乎没有任何效果（如果您有兴趣，可以找到一个小工具，它有助于说明CUDA驱动程序的页面大小行为here，虽然它是为早期版本的CUDA API编写的，并可能需要一些改变才能在现代版本中编译）。我相信这是您观察到的行为的最可能解释。

顺便说一句，如果我在MacOS 10.6上运行您发布的简化代码并使用GT200系列设备：

#include <cstdio>

#define mexPrintf printf

inline void gpuAssert(cudaError_t code, char *file, int line, 
                 bool abort=true)
{
   if (code != cudaSuccess) 
   {
      mexPrintf("GPUassert: %s %s %d\n", cudaGetErrorString(code),
          file, line);
      if (abort) exit(code);
   }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuMemReport(size_t * avail, size_t * total, 
        const char * title = 0, const size_t * free = 0, const bool sense = true) 
{
    char tstring[32] = { '\0' };
    gpuErrchk( cudaMemGetInfo(avail, total) );  

    if (free) {
        if (title) {
            strncpy(tstring, title, 31);
        }
        mexPrintf("%s Memory avaliable: Free: %zu, Total: %zu, %s: %zu\n",
                tstring, *avail, *total, (sense) ? "Allocated\0" : "Freed\0", 
                (sense) ? (*free - *avail) : (*avail - *free));
    } else {
        mexPrintf("Memory avaliable: Free: %zu, Total: %zu\n", *avail, *total);  
    }
}

int main()
{
    size_t freeMem = 0;
    size_t totalMem = 0;
    size_t allocMem = 0;

    gpuErrchk( cudaFree(0) );
    gpuMemReport(&freeMem, &totalMem);

    double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
    double *deviceReceivedReal, *deviceReceivedImag;

    mexPrintf("Allocating memory.\n");
    gpuErrchk( cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512) );
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999) );
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem);

    mexPrintf("\nFree'ing memory.\n");   
    gpuMemReport(&freeMem, &totalMem);

    gpuErrchk( cudaFree(devicePulseDelay) ); 
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem, false);

    gpuErrchk( cudaFree(deviceTarDistance) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem, false);

    gpuErrchk( cudaFree(deviceScattDistance) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem, false);

    gpuErrchk( cudaFree(deviceScatterers) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem, false);

    gpuErrchk( cudaFree(deviceReceivedReal) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem, false);

    gpuErrchk( cudaFree(deviceReceivedImag) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem, false);

    return 0;
}

我得到了不同的结果，但也展示了相同的现象：

Allocating memory.
devicePulseDelay: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceTarDistance: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceScattDistance: Memory avaliable: Free: 198778880, Total: 265027584, Allocated: 5140480
deviceScatterers: Memory avaliable: Free: 197730304, Total: 265027584, Allocated: 6189056
deviceReceivedReal: Memory avaliable: Free: 193638400, Total: 265027584, Allocated: 10280960
deviceReceivedImag: Memory avaliable: Free: 189546496, Total: 265027584, Allocated: 14372864

Free'ing memory.
Memory avaliable: Free: 189546496, Total: 265027584
devicePulseDelay: Memory avaliable: Free: 189546496, Total: 265027584, Freed: 0
deviceTarDistance: Memory avaliable: Free: 190595072, Total: 265027584, Freed: 1048576
deviceScattDistance: Memory avaliable: Free: 194686976, Total: 265027584, Freed: 5140480
deviceScatterers: Memory avaliable: Free: 195735552, Total: 265027584, Freed: 6189056
deviceReceivedReal: Memory avaliable: Free: 199827456, Total: 265027584, Freed: 10280960
deviceReceivedImag: Memory avaliable: Free: 203919360, Total: 265027584, Freed: 14372864

这表明行为也取决于硬件/主机操作系统。