CUDA分配数组的数组

Question

CUDA分配数组的数组

5

我在CUDA中分配数组的数组遇到了一些问题。

// `data` is intended to become a table of N pointers, one per sub-array.
void ** data;
cudaMalloc(&data, sizeof(void**)*N); // allocates without problems
for(int i = 0; i < N; i++) {
    // NOTE(review): after the call above, `data` holds an address in device
    // memory, so `data + i` is a device address as well. cudaMalloc stores the
    // new pointer through its first argument from host code, which presumably
    // is what faults here - see the answers below.
    cudaMalloc(data + i, getSize(i) * sizeof(void*)); // seg fault is thrown
}

我错在哪里了？

- user216179

这个程序的主要问题是所有这些数组的大小都不同。 - user216179

5个回答

4

我不认为这是被支持的。 cudaMalloc() 分配设备内存，但将地址存储在主机上的变量中。在您的 for 循环中，您正在传递设备内存中的地址。

根据您想要完成的任务，您可能希望在调用当前的 for 循环之前使用普通主机 malloc() 分配 data。或手动分配单个大块设备内存并计算偏移量。

请参阅 CUDA Programming Guide 的第 2.4 节、3.2.1 节和 B.2.5（底部）进行更多讨论。具体来说，在第 108 页的底部：

通过取 __device__、__shared__ 或 __constant__ 变量的地址获得的地址只能在设备代码中使用。

- Gabriel

1

这是正确的。cudaMalloc分配的指针必须驻留在主机内存中，而您正在尝试将这些指针存储在设备内存中。相反，您应该在主机上创建指针数组，然后在最后将其复制到设备上，或者像Gabriel建议的那样计算偏移量。 - Tom

2

您不能使用

cudaMalloc(&h_array[i], i * sizeof(void*));

对于声明为void *的数组

使用定义好的数据类型

// Host-side table with one CUdeviceptr slot per array; the element size is
// sizeof(CUdeviceptr), and the cast is required when compiling as C++.
CUdeviceptr *h_array = (CUdeviceptr *)malloc(sizeof(CUdeviceptr) * N);

或者

// Host-side table of N device pointers: each element is an int*, so the table
// itself must be an int** (an int* table would have int-sized slots).
int **h_array = (int **)malloc(sizeof(int *) * N);

将其转换为 void *

// cudaMalloc expects the address of the pointer slot as a void**.
cudaMalloc((void **)&h_array[i], i * sizeof(void*));

- yurijgera

2

我认为在第一个循环中应该是&h_array[i]而不是&d_array[i]。

- Jamshidi

1

我曾经遇到过同样的问题，并成功解决了它。

FabrizioM的回答对我来说是一个很好的起点，帮助了我很多。但是当我尝试将代码转移到我的项目时，仍然遇到了一些问题。通过使用其他评论和帖子，我能够编写一个可工作的示例（VS2012，CUDA7.5）。因此，我将把我的代码作为额外的答案发布，并作为其他人开始的起点。

为了理解命名：我正在使用OpenCV cv::Mat的向量作为输入，这些向量从多个相机中捕获，并在内核中处理这些图像。

     // Uploads every camera image in `Images` to its own device buffer and then
     // uploads a table holding the device address of each buffer, so a kernel can
     // index the images as an array of arrays.
     //
     //   Images - host images, one per camera. The size of image 0 is used for all
     //            images; pixels are copied as unsigned short.
     //            NOTE(review): assumes every cv::Mat stores 16-bit single-channel,
     //            contiguous data - confirm against the capture code.
     //
     // On any failure an error is printed to stderr, everything allocated so far
     // is released, and the function returns without leaving device memory behind.
     void TransferCameraImageToCuda(const std::vector<cv::Mat*>* Images)
{
     // Nothing to upload: avoids dereferencing a missing vector / first image.
     if (Images == NULL || Images->empty() || Images->at(0) == NULL)
         return;

     const int    NumberCams = static_cast<int>(Images->size());
     // size_t arithmetic so large images cannot overflow an int pixel count.
     const size_t imageSize  = static_cast<size_t>(Images->at(0)->cols) * Images->at(0)->rows;
     const size_t imageBytes = imageSize * sizeof(unsigned short);
     const size_t tableBytes = sizeof(CUdeviceptr) * NumberCams;

     // Host copy of the device addresses (zero-initialised so that cleanup can
     // free every slot unconditionally); owned by the vector, so it cannot leak.
     std::vector<CUdeviceptr> CamArraysAdressOnDevice_H(NumberCams, 0);
     // Device copy of the same address table.
     CUdeviceptr*             CamArraysAdressOnDevice_D = NULL;

     cudaError_t status = cudaSuccess;

     // Allocate one device buffer per image and copy the pixels into it.
     for (int i = 0; i < NumberCams && status == cudaSuccess; i++){
         const cv::Mat* image = Images->at(i);

         // A missing or smaller image would make the copy read past its data.
         if (image == NULL || static_cast<size_t>(image->cols) * image->rows < imageSize){
             status = cudaErrorInvalidValue;
             break;
         }

         status = cudaMalloc((void**)&CamArraysAdressOnDevice_H[i], imageBytes);
         if (status != cudaSuccess){
             CamArraysAdressOnDevice_H[i] = 0;   // keep the slot safe to free
             break;
         }

         status = cudaMemcpy((void*)CamArraysAdressOnDevice_H[i], image->data, imageBytes, cudaMemcpyHostToDevice);
     }

     // Allocate the device-side address table and fill it from the host copy.
     if (status == cudaSuccess){
         status = cudaMalloc((void**)&CamArraysAdressOnDevice_D, tableBytes);
         if (status != cudaSuccess)
             CamArraysAdressOnDevice_D = NULL;
     }
     if (status == cudaSuccess)
         status = cudaMemcpy(CamArraysAdressOnDevice_D, &CamArraysAdressOnDevice_H[0], tableBytes, cudaMemcpyHostToDevice);

     if (status != cudaSuccess){
         fprintf(stderr, "TransferCameraImageToCuda failed: %s\n", cudaGetErrorString(status));

         // Roll back: freeing a zero pointer is a no-op, so every slot is safe.
         for (int i = 0; i < NumberCams; i++)
             cudaFree((void*)CamArraysAdressOnDevice_H[i]);
         cudaFree(CamArraysAdressOnDevice_D);
         return;
     }

     // NOTE(review): on success the device buffers and the device table stay
     // allocated, but CamArraysAdressOnDevice_D is local to this function. Launch
     // the kernel and cudaFree the buffers here, or store the pointer somewhere
     // that outlives this call - otherwise the device memory is unreachable.
}

在内核启动时，我将设备指针转换为数据类型指针（unsigned short **）。

// The device address table is reinterpreted as an array of unsigned short* for the kernel.
DummyKernel<<<gridDim,blockDim>>>(NumberCams, (unsigned short**) CamArraysAdressOnDevice_D);

而内核定义的示例如下：

// Debug kernel: prints one sample pixel from each of the first (up to three)
// camera images. Every launched thread executes the same prints.
//
//   NumberImages - number of valid entries in CamImages
//   CamImages    - table of per-image pixel arrays, indexed CamImages[image][pixel]
//
// NOTE(review): assumes each image holds more than 3458 pixels - confirm
// against the image size used on the host side.
__global__ void DummyKernel(int NumberImages, unsigned short** CamImages)
{
    int someIndex = 3458;

    // Never touch more table entries than the caller says exist; the original
    // read entries 0..2 unconditionally.
    const int shown = NumberImages < 3 ? NumberImages : 3;
    for (int image = 0; image < shown; ++image) {
        printf("Value Image %d : %d \n", image, CamImages[image][someIndex]);
    }
}

- Christian_B

网页内容由 Stack Overflow 提供，点击上方的“原文链接”可以查看英文原文。

- fabrizioM · Accepted Answer

您需要为主机内存分配指针，然后为每个数组分配设备内存并将其指针存储在主机内存中。然后为存储指针的内存分配设备内存，最后将主机内存复制到设备内存。一个例子胜过千言万语：

// Placeholder kernel that receives a table of per-array pointers.
//   N      - count supplied by the caller (the host code below passes the
//            number of arrays)
//   arrays - pointer table with one entry per array.
//            NOTE(review): the table itself must be readable from device code,
//            i.e. reside in device memory - confirm at the call site.
__global__ void multi_array_kernel( int N, void** arrays ){
    // stuff (kernel body intentionally left empty in this example)
}


// Builds an "array of arrays" on the device:
//   1. allocate a host table of pointers,
//   2. cudaMalloc each sub-array and store its device pointer in the host table,
//   3. cudaMalloc a device table and copy the host table into it,
//   4. pass the device table to the kernel.
// Returns 0 on success and 1 if any allocation, copy or launch step failed.
int main(){

    const int N_ARRAYS = 20;

    // Host-side table of device pointers. It must be a void** (not void*) so
    // that h_array[i] is an addressable pointer slot.
    void **h_array = (void**)malloc(sizeof(void*) * N_ARRAYS);
    if (h_array == NULL) {
        return 1;
    }
    for (int i = 0; i < N_ARRAYS; i++) {
        h_array[i] = NULL;   // lets the cleanup below free every slot unconditionally
    }

    void **d_array = NULL;   // device-side copy of the pointer table
    cudaError_t err = cudaSuccess;

    // Each sub-array gets a different size; cudaMalloc writes the device
    // pointer into the host table slot.
    // NOTE(review): i == 0 requests zero bytes - confirm this is intended.
    for (int i = 0; i < N_ARRAYS && err == cudaSuccess; i++) {
        err = cudaMalloc(&h_array[i], i * sizeof(void*));
        if (err != cudaSuccess) {
            h_array[i] = NULL;
        }
    }

    // cudaMalloc returns a status code; the pointer comes back through its
    // first argument.
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&d_array, sizeof(void*) * N_ARRAYS);
        if (err != cudaSuccess) {
            d_array = NULL;
        }
    }

    // Copy the pointer table to device memory.
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_array, h_array, sizeof(void*) * N_ARRAYS, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        multi_array_kernel<<<1,1>>>(N_ARRAYS, d_array);
        err = cudaGetLastError();          // launch-configuration errors
    }
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();     // replaces deprecated cudaThreadSynchronize
    }

    // h_array lives in host memory, but each entry is a device pointer, so the
    // entries are released with cudaFree and the table itself with free.
    for (int i = 0; i < N_ARRAYS; i++) {
        cudaFree(h_array[i]);
    }
    cudaFree(d_array);
    free(h_array);

    return err == cudaSuccess ? 0 : 1;
}