我有下面这段(可编译且可执行)使用CUDA Thrust来对float2
数组进行归约的代码。它能正常工作。
using namespace std;
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <conio.h>
#include <typeinfo>
#include <iostream>
// includes CUDA
#include <cuda.h>
#include <cuda_runtime.h>
// includes Thrust
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
// float2 + struct
struct add_float2 {
__device__ float2 operator()(const float2& a, const float2& b) const {
float2 r;
r.x = a.x + b.x;
r.y = a.y + b.y;
return r;
}
};
// double2 + struct
struct add_double2 {
__device__ double2 operator()(const double2& a, const double2& b) const {
double2 r;
r.x = a.x + b.x;
r.y = a.y + b.y;
return r;
}
};
void main( int argc, char** argv)
{
int N = 20;
// --- Host
float2* ha; ha = (float2*) malloc(N*sizeof(float2));
for (unsigned i=0; i<N; ++i) {
ha[i].x = 1;
ha[i].y = 2;
}
// --- Device
float2* da; cudaMalloc((void**)&da,N*sizeof(float2));
cudaMemcpy(da,ha,N*sizeof(float2),cudaMemcpyHostToDevice);
thrust::device_ptr<float2> dev_ptr_1(da);
thrust::device_ptr<float2> dev_ptr_2(da+N);
float2 init; init.x = init.y = 0.0f;
float2 sum = thrust::reduce(dev_ptr_1,dev_ptr_2,init,add_float2());
cout << " Real part = " << sum.x << "; Imaginary part = " << sum.y << endl;
getch();
}
然而,当我在
main
程序中将float2
更改为double2
时,即void main( int argc, char** argv)
{
int N = 20;
// --- Host
double2* ha; ha = (double2*) malloc(N*sizeof(double2));
for (unsigned i=0; i<N; ++i) {
ha[i].x = 1;
ha[i].y = 2;
}
// --- Device
double2* da; cudaMalloc((void**)&da,N*sizeof(double2));
cudaMemcpy(da,ha,N*sizeof(double2),cudaMemcpyHostToDevice);
thrust::device_ptr<double2> dev_ptr_1(da);
thrust::device_ptr<double2> dev_ptr_2(da+N);
double2 init; init.x = init.y = 0.0;
double2 sum = thrust::reduce(dev_ptr_1,dev_ptr_2,init,add_double2());
cout << " Real part = " << sum.x << "; Imaginary part = " << sum.y << endl;
getch();
}
我在reduce
行收到一个exception
。我如何使用CUDA Thrust对double2
数组进行缩减?我有做错什么吗?提前感谢。
根据TALONMIES的答案,以下是可行的解决方案:
using namespace std;
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <conio.h>
#include <typeinfo>
#include <iostream>
// includes CUDA
#include <cuda.h>
#include <cuda_runtime.h>
// includes Thrust
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
struct my_double2 {
double x, y;
};
// double2 + struct
struct add_my_double2 {
__device__ my_double2 operator()(const my_double2& a, const my_double2& b) const {
my_double2 r;
r.x = a.x + b.x;
r.y = a.y + b.y;
return r;
}
};
void main( int argc, char** argv)
{
int N = 20;
// --- Host
my_double2* ha; ha = (my_double2*) malloc(N*sizeof(my_double2));
for (unsigned i=0; i<N; ++i) {
ha[i].x = 1;
ha[i].y = 2;
}
// --- Device
my_double2* da; cudaMalloc((void**)&da,N*sizeof(my_double2));
cudaMemcpy(da,ha,N*sizeof(my_double2),cudaMemcpyHostToDevice);
thrust::device_ptr<my_double2> dev_ptr_1(da);
thrust::device_ptr<my_double2> dev_ptr_2(da+N);
my_double2 init; init.x = init.y = 0.0;
cout << "here3\n";
my_double2 sum = thrust::reduce(dev_ptr_1,dev_ptr_2,init,add_my_double2());
cout << " Real part = " << sum.x << "; Imaginary part = " << sum.y << endl;
getch();
}
double2
定义为struct
。性能如何?过去我曾尝试过使用我的double2
版本作为一对double
数字,但我意识到内部使用double2
的包装器类的解决方案比一对double
更快。 - Vitalitydouble2
与结构体之间的区别 - 我想你可能不知道double2
实际上就是一个简单的结构体(请查看vector_types.h
)。使用CUDA向量类型和使用自己的功能等效结构体在生成的代码或性能上应该没有任何区别(但根据我的经验并非如此)。 - talonmiesdouble2
是一个struct
,但在定义前面有一些特殊的关键字:__device_builtin__ __builtin_align__(16)
。据我所知,它们应该允许协同内存访问。相比不使用对齐的自定义定义,这样做会带来一些改进吗? - Vitality__device__builtin__
只是一个标签,它没有任何作用,而__builtin__align__
解析为基本的__align__
宏。CUDA“builin”向量类型只是简单的结构体,它们依赖于编译器分析来实现合并,就像任何用户定义的类型一样。它们真正有意义的地方只在于与纹理硬件的关系,即使如此,也仅仅是为了让类型大小和对齐方式与纹理API所期望的相匹配。 - talonmies