作为使用CUDA Thrust的明显简单替代方案,我在下面发布了一个使用CUDA实现经典Matlab中“meshgrid”函数的示例。
在Matlab中:
x = [1 2 3];
y = [4 5 6 7];
[X, Y] = meshgrid(x, y);
产生
X =
1 2 3
1 2 3
1 2 3
1 2 3
并且
Y =
4 4 4
5 5 5
6 6 6
7 7 7
X
是 x
数组的四倍复制,这是 OP 的问题和 Robert Crovella 答案的第一个猜测,而 Y
是每个 y
数组元素的三倍连续复制,这是 Robert Crovella 答案的第二个猜测。
以下是代码:
#include <cstdio>
#include <thrust/pair.h>
#include "Utilities.cuh"
#define BLOCKSIZE_MESHGRID_X 16
#define BLOCKSIZE_MESHGRID_Y 16
#define DEBUG
template <class T>
__global__ void meshgrid_kernel(const T * __restrict__ x, size_t Nx, const float * __restrict__ y, size_t Ny, T * __restrict__ X, T * __restrict__ Y)
{
unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int tidy = blockIdx.y * blockDim.y + threadIdx.y;
if ((tidx < Nx) && (tidy < Ny)) {
X[tidy * Nx + tidx] = x[tidx];
Y[tidy * Nx + tidx] = y[tidy];
}
}
template <class T>
thrust::pair<T *,T *> meshgrid(const T *x, const unsigned int Nx, const T *y, const unsigned int Ny) {
T *X; gpuErrchk(cudaMalloc((void**)&X, Nx * Ny * sizeof(T)));
T *Y; gpuErrchk(cudaMalloc((void**)&Y, Nx * Ny * sizeof(T)));
dim3 BlockSize(BLOCKSIZE_MESHGRID_X, BLOCKSIZE_MESHGRID_Y);
dim3 GridSize (iDivUp(Nx, BLOCKSIZE_MESHGRID_X), iDivUp(BLOCKSIZE_MESHGRID_Y, BLOCKSIZE_MESHGRID_Y));
meshgrid_kernel<<<GridSize, BlockSize>>>(x, Nx, y, Ny, X, Y);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
return thrust::make_pair(X, Y);
}
int main()
{
const int Nx = 3;
const int Ny = 4;
float *h_x = (float *)malloc(Nx * sizeof(float));
float *h_y = (float *)malloc(Ny * sizeof(float));
float *h_X = (float *)malloc(Nx * Ny * sizeof(float));
float *h_Y = (float *)malloc(Nx * Ny * sizeof(float));
for (int i = 0; i < Nx; i++) h_x[i] = i;
for (int i = 0; i < Ny; i++) h_y[i] = i + 4.f;
float *d_x; gpuErrchk(cudaMalloc(&d_x, Nx * sizeof(float)));
float *d_y; gpuErrchk(cudaMalloc(&d_y, Ny * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x, h_x, Nx * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y, h_y, Ny * sizeof(float), cudaMemcpyHostToDevice));
thrust::pair<float *, float *> meshgrid_pointers = meshgrid(d_x, Nx, d_y, Ny);
float *d_X = (float *)meshgrid_pointers.first;
float *d_Y = (float *)meshgrid_pointers.second;
gpuErrchk(cudaMemcpy(h_X, d_X, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_Y, d_Y, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));
for (int j = 0; j < Ny; j++) {
for (int i = 0; i < Nx; i++) {
printf("i = %i; j = %i; x = %f; y = %f\n", i, j, h_X[j * Nx + i], h_Y[j * Nx + i]);
}
}
return 0;
}
,您想要得到
{1, 1, 1, ..., 2, 2, 2, ..., 3, 3, 3, ...}还是
{1, 2, 3, 1, 2, 3, ...}`或者您想要其他东西? - Robert Crovella{1, 2, 3, 1, 2, 3, .... }
。 - fahad