There is an allocation granularity.
This means that if you ask for 1 byte, or 400 bytes, what is actually used up is something like 65536 bytes. So a large number of very small allocations will actually use up memory at a much faster rate than what you would predict based on the requested allocation size. The solution is to not make very small allocations, but instead to allocate in larger chunks.
An alternative strategy here would also be to flatten your allocations, and carve out pieces from them for each of the arrays:
#include <iostream>
#include <cstdio>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define N 100000
#define ARR_SZ 100

struct Struct
{
    float* arr;
};

int main()
{
    Struct* struct_arr;
    float* f;
    // one allocation for all the structs, one for all the array payloads
    gpuErrchk(cudaMallocManaged((void**)&struct_arr, sizeof(Struct) * N));
    gpuErrchk(cudaMallocManaged((void**)&f, sizeof(float) * N * ARR_SZ));
    // carve the flat buffer into N pieces of ARR_SZ floats each
    for (int i = 0; i < N; ++i)
        struct_arr[i].arr = f + i * ARR_SZ;
    cudaFree(struct_arr);
    cudaFree(f);
    return 0;
}
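For reference, a kernel consuming that carved-out layout needs no special handling. The following is a minimal sketch under my own naming (scale_arrays and the scaling operation are illustrative, not part of the original code); it could be added above main() in the listing above:

__global__ void scale_arrays(Struct* s, float factor)
{
    // one block per struct; threads stride through that struct's array
    for (int j = threadIdx.x; j < ARR_SZ; j += blockDim.x)
        s[blockIdx.x].arr[j] *= factor;
}

// example launch, before the cudaFree calls:
//   scale_arrays<<<N, 128>>>(struct_arr, 2.0f);
//   gpuErrchk(cudaDeviceSynchronize());  // required before host access to managed memory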
ARR_SZ divisible by 4 means that the various pointers created can also be up-cast to larger vector types such as float2 or float4, if your intent was to use that.
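As a hedged illustration of that up-casting (the kernel name is mine, not from the original code): this works because cudaMallocManaged returns pointers with large alignment, and each carved-out arr starts at a multiple of ARR_SZ * sizeof(float) = 400 bytes, itself a multiple of the 16-byte alignment float4 requires.

__global__ void scale_vec4(Struct* s, float factor)
{
    // view each ARR_SZ-float array as ARR_SZ/4 float4 elements
    float4* v = reinterpret_cast<float4*>(s[blockIdx.x].arr);
    for (int j = threadIdx.x; j < ARR_SZ / 4; j += blockDim.x)
    {
        float4 t = v[j];                                  // one 16-byte vectorized load
        t.x *= factor; t.y *= factor; t.z *= factor; t.w *= factor;
        v[j] = t;                                         // one 16-byte vectorized store
    }
}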
A possible reason the original code works on linux is that managed memory on linux, in a proper setup, can oversubscribe the GPU physical memory. The result is that the actual allocation limit is much higher than what the GPU on-board memory would suggest. It might also be that the linux case has a bit more free memory, or that the allocation granularity on linux is different (smaller).
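If you want to check whether your own setup permits oversubscription, a minimal probe could look like the sketch below; the 2x factor is an arbitrary choice of mine, and whether the allocation succeeds depends on OS, driver model and GPU architecture.

#include <iostream>
#include <cstring>

int main()
{
    size_t fre, tot;
    cudaMemGetInfo(&fre, &tot);
    char* big;
    // ask for more managed memory than the GPU physically has
    cudaError_t err = cudaMallocManaged((void**)&big, 2 * tot);
    if (err == cudaSuccess)
    {
        memset(big, 0, 2 * tot);  // touch from the host; pages migrate on demand
        cudaFree(big);
    }
    std::cout << "oversubscription " << (err == cudaSuccess ? "worked" : "failed")
              << ": " << cudaGetErrorString(err) << std::endl;
    return 0;
}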
Based on a question in the comments, I decided to estimate the allocation granularity, using this code:
#include <iostream>
#include <cstdio>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define N 100000
#define ARR_SZ 100

struct Struct
{
    float* arr;
};

int main()
{
    Struct* struct_arr;
    gpuErrchk(cudaMallocManaged((void**)&struct_arr, sizeof(Struct) * N));
#if 0
    // flattened variant from above, disabled for this measurement
    float* f;
    gpuErrchk(cudaMallocManaged((void**)&f, sizeof(float) * N * ARR_SZ));
    for (int i = 0; i < N; ++i)
        struct_arr[i].arr = f + i * ARR_SZ;
#else
    size_t fre, tot;
    gpuErrchk(cudaMemGetInfo(&fre, &tot));
    std::cout << "Free: " << fre << " total: " << tot << std::endl;
    // 100,000 individual small allocations
    for (int i = 0; i < N; ++i)
        gpuErrchk(cudaMallocManaged((void**)&(struct_arr[i].arr), sizeof(float) * ARR_SZ));
    gpuErrchk(cudaMemGetInfo(&fre, &tot));
    std::cout << "Free: " << fre << " total: " << tot << std::endl;
    for (int i = 0; i < N; ++i)
        cudaFree(struct_arr[i].arr);
#endif
    cudaFree(struct_arr);
    return 0;
}
When I compile a debug project with that code, and run it on a windows 10 desktop with an RTX 2070 GPU (8GB of memory, same as a GTX 1070 Ti), I get the following output:
Microsoft Windows [Version 10.0.17763.973]
(c) 2018 Microsoft Corporation. All rights reserved.
C:\Users\Robert Crovella>cd C:\Users\Robert Crovella\source\repos\test12\x64\Debug
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>test12
Free: 7069866393 total: 8589934592
Free: 516266393 total: 8589934592
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>test12
Free: 7069866393 total: 8589934592
Free: 516266393 total: 8589934592
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>
Note that on my machine there is only 0.5GB of reported free memory left after the 100,000 allocations. So if for any reason your 8GB GPU starts out with less free memory (entirely possible) you may run into an out-of-memory error, even though I did not.
The calculation of the allocation granularity is as follows:
(7069866393 - 516266393) / 100000 = 65536 bytes per allocation(!)
So my previous estimate of 4096 bytes per allocation was way off, by at least 1 order of magnitude, on my machine/test setup.
The allocation granularity may vary based on:
- windows or linux
- WDDM or TCC
- x86 or Power9
- managed vs ordinary cudaMalloc
- possibly other factors (e.g. CUDA version)
so my advice to future readers would be not to assume that the minimum is always 65536 bytes per allocation.
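If the exact number matters, measure it on your own setup. Here is the measurement above packaged as a sketch of a helper function (the name measure_granularity and the default count are my own choices; it assumes the cudaMemGetInfo delta is dominated by these allocations):

#include <cstdio>
#include <vector>

// Estimate bytes actually consumed per cudaMallocManaged call of a given request size.
size_t measure_granularity(size_t request_bytes, int num_allocs = 10000)
{
    size_t free_before, free_after, tot;
    std::vector<void*> ptrs(num_allocs);
    cudaMemGetInfo(&free_before, &tot);
    for (int i = 0; i < num_allocs; ++i)
        cudaMallocManaged(&ptrs[i], request_bytes);   // error checks omitted for brevity
    cudaMemGetInfo(&free_after, &tot);
    for (int i = 0; i < num_allocs; ++i)
        cudaFree(ptrs[i]);
    return (free_before - free_after) / num_allocs;
}

int main()
{
    printf("%zu bytes per allocation\n", measure_granularity(400));
    return 0;
}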