在用户编写的内核中使用 Thrust

Question

在用户编写的内核中使用 Thrust

cudathrust

47

我是Thrust的新手。我发现所有Thrust的演示和例子都只展示主机端代码。

我想知道是否可以将device_vector传递给自己的内核？如何实现？如果可以，那么内核/设备端代码中允许进行哪些操作？

我是一名Thrust新手。我注意到所有的Thrust演示和例子都只展示主机端代码。

我想知道是否可以将 device_vector 传递给我的自定义内核？如果可以，如何实现呢？如果可以，那么在内核或设备端代码中，允许进行哪些操作？

- Ashwin Nanjappa

4个回答

21

这是我以前回答的内容的更新。

从 Thrust 1.8.1 开始，CUDA Thrust 原语可以与 thrust::device 执行策略结合使用，借助 CUDA 动态并行性，从单个 CUDA 线程内部发起并行执行。下面给出一个示例。

#include <stdio.h>

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

#include "TimingGPU.cuh"
#include "Utilities.cuh"

#define BLOCKSIZE_1D    256
#define BLOCKSIZE_2D_X  32
#define BLOCKSIZE_2D_Y  32

/*************************/
/* TEST KERNEL FUNCTIONS */
/*************************/
/*
 * Row-wise reduction using the sequential Thrust policy.
 *
 * Each thread whose global index is below Nrows reduces one row of Ncols
 * consecutive floats from d_data with thrust::reduce(thrust::seq, ...) and
 * stores the result in d_results at that same index. The index is computed
 * from the x dimension of the launch only, and the offset arithmetic is
 * carried out in unsigned int.
 */
__global__ void test1(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {

    const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last row have nothing to reduce.
    if (row >= Nrows) return;

    const float *rowBegin = d_data + row * Ncols;
    const float *rowEnd   = d_data + (row + 1) * Ncols;

    d_results[row] = thrust::reduce(thrust::seq, rowBegin, rowEnd);

}

/*
 * Row-wise reduction using the thrust::device policy from inside a kernel.
 *
 * Same layout as the sequential variant: each thread whose global index is
 * below Nrows reduces one row of Ncols consecutive floats from d_data and
 * stores the result in d_results at that index. The only difference is the
 * execution policy handed to thrust::reduce.
 * NOTE(review): invoking thrust::device from device code presumably relies on
 * dynamic parallelism (relocatable device code / device linking) -- confirm
 * against the Thrust version and build flags in use.
 */
__global__ void test2(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {

    const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last row have nothing to reduce.
    if (row >= Nrows) return;

    const float *rowBegin = d_data + row * Ncols;
    const float *rowEnd   = d_data + (row + 1) * Ncols;

    d_results[row] = thrust::reduce(thrust::device, rowBegin, rowEnd);

}

/********/
/* MAIN */
/********/
/*
 * Host driver: builds an Nrows x Ncols matrix whose row i is filled with the
 * value i, computes the per-row sums on the host as a reference, then runs
 * and times the test1 (thrust::seq) and test2 (thrust::device) kernels and
 * compares each kernel's output against the reference.
 *
 * Each approach writes to its own device/host result buffers so that the
 * second check cannot pass on data left over from the first one.
 *
 * Returns 0 when both approaches match the reference, 1 on a host allocation
 * failure or on the first mismatch. All buffers are released on every path
 * that reaches the end of the function.
 */
int main() {

    const int Nrows = 64;
    const int Ncols = 2048;

    // NOTE(review): presumably issued to force CUDA context creation before
    // the timed sections -- confirm.
    gpuErrchk(cudaFree(0));

//    size_t DevQueue;
//    gpuErrchk(cudaDeviceGetLimit(&DevQueue, cudaLimitDevRuntimePendingLaunchCount));
//    DevQueue *= 128;
//    gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, DevQueue));

    float *h_data       = (float *)malloc(Nrows * Ncols * sizeof(float));
    float *h_results    = (float *)malloc(Nrows *         sizeof(float));
    float *h_results1   = (float *)malloc(Nrows *         sizeof(float));
    float *h_results2   = (float *)malloc(Nrows *         sizeof(float));
    if (h_data == NULL || h_results == NULL || h_results1 == NULL || h_results2 == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all four pointers is safe.
        free(h_data);
        free(h_results);
        free(h_results1);
        free(h_results2);
        return 1;
    }

    // Fill the matrix and accumulate the host-side reference row sums.
    for (int i=0; i<Nrows; i++) {
        h_results[i] = 0.f;
        for (int j=0; j<Ncols; j++) {
            h_data[i*Ncols+j] = i;
            h_results[i] = h_results[i] + h_data[i*Ncols+j];
        }
    }

    TimingGPU timerGPU;

    float *d_data;          gpuErrchk(cudaMalloc((void**)&d_data,     Nrows * Ncols * sizeof(float)));
    float *d_results1;      gpuErrchk(cudaMalloc((void**)&d_results1, Nrows         * sizeof(float)));
    float *d_results2;      gpuErrchk(cudaMalloc((void**)&d_results2, Nrows         * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_data, h_data, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));

    bool ok = true;

    // --- Approach nr. 1: thrust::seq inside the kernel
    timerGPU.StartCounter();
    test1<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing approach nr. 1 = %f\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i=0; i<Nrows; i++) {
        if (h_results1[i] != h_results[i]) {
            printf("Approach nr. 1; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]);
            ok = false;
            break;
        }
    }

    // --- Approach nr. 2: thrust::device inside the kernel, using its own
    //     output buffers so the comparison really tests this kernel.
    if (ok) {
        timerGPU.StartCounter();
        test2<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results2, Nrows, Ncols);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        printf("Timing approach nr. 2 = %f\n", timerGPU.GetCounter());

        gpuErrchk(cudaMemcpy(h_results2, d_results2, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

        for (int i=0; i<Nrows; i++) {
            if (h_results2[i] != h_results[i]) {
                printf("Approach nr. 2; Error at i = %i; h_results2 = %f; h_results = %f", i, h_results2[i], h_results[i]);
                ok = false;
                break;
            }
        }
    }

    if (ok) printf("Test passed!\n");

    // Release device and host buffers regardless of the outcome.
    gpuErrchk(cudaFree(d_data));
    gpuErrchk(cudaFree(d_results1));
    gpuErrchk(cudaFree(d_results2));
    free(h_data);
    free(h_results);
    free(h_results1);
    free(h_results2);

    return ok ? 0 : 1;

}

上面的示例对矩阵的各行进行归约，其思路与《Reduce matrix rows with CUDA》（使用 CUDA 对矩阵行进行归约）一帖相同，但与该帖不同的是，这里是直接在用户编写的内核中调用 CUDA Thrust 原语来完成的。此外，上面的示例还用于比较 thrust::seq 与 thrust::device 两种执行策略下完成相同操作的性能差异。下面是一些显示性能差异的图表。

性能已在Kepler K20c和Maxwell GeForce GTX 850M上进行评估。

- Vitality

16

我希望为这个问题提供一个更新的答案。

从 Thrust 1.8 开始，CUDA Thrust 原语可以与 thrust::seq 执行策略结合使用，在单个 CUDA 线程（或单个 CPU 线程）中顺序运行。下面是一个示例。

如果您想要在线程内进行并行执行，则可以考虑使用 CUB，该库提供了可以从线程块内调用的规约例程，前提是您的显卡启用了动态并行性。

以下是使用 Thrust 的示例：

#include <stdio.h>

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
/*
 * gpuErrchk(call): evaluates a CUDA runtime call and forwards its status,
 * together with the call site's file and line, to gpuAssert.
 * The macro expands to a single do { } while (0) statement so it can be used
 * safely as the body of an unbraced if/else; it must be followed by ';'.
 *
 * gpuAssert: if code is not cudaSuccess, prints the CUDA error string, file
 * and line to stderr and, when abort is true (the default), terminates the
 * process with the error code as exit status. Does nothing on cudaSuccess.
 *
 *   code  - status returned by a CUDA runtime call
 *   file  - source file name of the call site (const: receives __FILE__)
 *   line  - source line of the call site
 *   abort - whether to exit() after reporting the error
 */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/*
 * Sums the N floats starting at d_A with thrust::reduce under the sequential
 * (thrust::seq) policy and prints the total with device-side printf.
 * Every thread of the launch performs the same reduction and prints its own
 * line, as there is no thread-index guard.
 */
__global__ void test(float *d_A, int N) {

    float *first = d_A;
    float *last  = d_A + N;

    const float total = thrust::reduce(thrust::seq, first, last);

    printf("Device side result = %f\n", total);

}

/*
 * Host driver for the thrust::seq example: fills a host array with 0..N-1,
 * prints the host-side sum, copies the array to the device and launches the
 * test kernel with a single thread to print the device-side sum.
 *
 * Returns 0 on success, 1 if the host allocation fails.
 */
int main() {

    const int N = 16;

    float *h_A = (float*)malloc(N * sizeof(float));
    if (h_A == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        return 1;
    }

    float sum = 0.f;
    for (int i=0; i<N; i++) {
        h_A[i] = i;
        sum = sum + h_A[i];
    }
    printf("Host side result = %f\n", sum);

    float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice));

    test<<<1,1>>>(d_A, N);
    // Launch-configuration errors are only visible through the last-error state.
    gpuErrchk(cudaPeekAtLastError());
    // Wait for the kernel: this surfaces execution errors and flushes the
    // device-side printf buffer, which is not flushed automatically at exit.
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaFree(d_A));
    free(h_A);

    return 0;

}

- Vitality

6

如果你想使用thrust分配/处理的数据，是可以的，只需要获取所分配数据的原始指针。

// Obtain a plain pointer from dev_ptr so it can be passed to a kernel as an argument.
// NOTE(review): dev_ptr is not declared in this snippet -- presumably a thrust::device_ptr<int>; confirm.
int * raw_ptr = thrust::raw_pointer_cast(dev_ptr);

如果你想在内核中分配 Thrust 向量，我从未尝试过，但我认为不会起作用。即使它能够工作，我也不认为它会提供任何好处。

- fabrizioM

1

FabrizioM: 我希望我能够将一个device_vector传递给我的内核，并在内核内部调用size()函数。看起来目前这是不可能的。那我会使用raw_pointer_cast并将大小作为单独的参数发送到内核中。 - Ashwin Nanjappa

Ashwin：没错。你想做的事情是不可能的。你需要单独传递大小。 - Ade Miller

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- talonmies · Accepted Answer

按照最初的设计，Thrust 纯粹是一个主机端抽象，不能在核函数内部使用。您可以将封装在 thrust::device_vector 中的设备内存传递给自己的核函数，方法如下：

// Vector managed by Thrust; Foo is a placeholder element type.
thrust::device_vector< Foo > fooVector;
// Do something thrust-y with fooVector

// Extract a raw Foo* from the vector's data() handle.
Foo* fooArray = thrust::raw_pointer_cast( fooVector.data() );

// Pass raw array and its size to kernel
// (x and y are placeholders for the two launch-configuration arguments).
someKernelCall<<< x, y >>>( fooArray, fooVector.size() );

你也可以通过使用裸的CUDA设备内存指针来实例化thrust::device_ptr，在thrust算法中使用未由thrust分配的设备内存。

四年半之后编辑，根据@JackOLantern的答案，thrust 1.8添加了顺序执行策略，这意味着您可以在设备上运行单线程版本的thrust算法。请注意，仍然不能直接将thrust设备向量传递给内核，并且不能在设备代码中直接使用设备向量。

请注意，在某些情况下，也可以使用 thrust::device 执行策略，让内核以子网格（child grid）的形式启动并行的 thrust 执行。这需要单独编译/设备链接，以及支持动态并行性的硬件。我不确定是否所有 thrust 算法都支持这种方式，但至少部分算法是支持的。