我正在将 njuffa 的评论扩展为一个实际示例。在该示例中,我仅以三种不同的方式相加两个数组:将数据加载为 float、float2 或 float4。
以下是在 GT540M 和 Kepler K20c 卡上的计时结果:
GT540M
float - Elapsed time: 74.1 ms
float2 - Elapsed time: 61.0 ms
float4 - Elapsed time: 56.1 ms
Kepler K20c
float - Elapsed time: 4.4 ms
float2 - Elapsed time: 3.3 ms
float4 - Elapsed time: 3.2 ms
可以看到,将数据加载为 float4 是最快的方法。
以下是三个内核的反汇编代码(针对计算能力 2.1 编译)。
add_float
Function : _Z9add_floatPfS_S_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]
/*0008*/ S2R R2, SR_TID.X
/*0010*/ SHL R2, R2, 0x2
/*0018*/ S2R R0, SR_CTAID.X
/*0020*/ SHL R0, R0, 0x2
/*0028*/ IMAD R0, R0, c[0x0][0x8], R2
/*0030*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT
/*0038*/ @P0 BRA.U 0xd8
/*0040*/ @!P0 ISCADD R2, R0, c[0x0][0x24], 0x2
/*0048*/ @!P0 ISCADD R10, R0, c[0x0][0x20], 0x2
/*0050*/ @!P0 ISCADD R0, R0, c[0x0][0x28], 0x2
/*0058*/ @!P0 LD R8, [R2]
/*0060*/ @!P0 LD R6, [R2+0x4]
/*0068*/ @!P0 LD R4, [R2+0x8]
/*0070*/ @!P0 LD R9, [R10]
/*0078*/ @!P0 LD R7, [R10+0x4]
/*0080*/ @!P0 LD R5, [R10+0x8]
/*0088*/ @!P0 LD R3, [R10+0xc]
/*0090*/ @!P0 LD R2, [R2+0xc]
/*0098*/ @!P0 FADD R8, R9, R8
/*00a0*/ @!P0 FADD R6, R7, R6
/*00a8*/ @!P0 FADD R4, R5, R4
/*00b0*/ @!P0 ST [R0], R8
/*00b8*/ @!P0 FADD R2, R3, R2
/*00c0*/ @!P0 ST [R0+0x4], R6
/*00c8*/ @!P0 ST [R0+0x8], R4
/*00d0*/ @!P0 ST [R0+0xc], R2
/*00d8*/ EXIT
add_float2
Function : _Z10add_float2P6float2S0_S0_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]
/*0008*/ S2R R2, SR_TID.X
/*0010*/ SHL R2, R2, 0x1
/*0018*/ S2R R0, SR_CTAID.X
/*0020*/ SHL R0, R0, 0x1
/*0028*/ IMAD R0, R0, c[0x0][0x8], R2
/*0030*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT
/*0038*/ @P0 BRA.U 0xa8
/*0040*/ @!P0 ISCADD R10, R0, c[0x0][0x20], 0x3
/*0048*/ @!P0 ISCADD R11, R0, c[0x0][0x24], 0x3
/*0050*/ @!P0 ISCADD R0, R0, c[0x0][0x28], 0x3
/*0058*/ @!P0 LD.64 R4, [R10]
/*0060*/ @!P0 LD.64 R8, [R11]
/*0068*/ @!P0 LD.64 R2, [R10+0x8]
/*0070*/ @!P0 LD.64 R6, [R11+0x8]
/*0078*/ @!P0 FADD R9, R5, R9
/*0080*/ @!P0 FADD R8, R4, R8
/*0088*/ @!P0 FADD R3, R3, R7
/*0090*/ @!P0 FADD R2, R2, R6
/*0098*/ @!P0 ST.64 [R0], R8
/*00a0*/ @!P0 ST.64 [R0+0x8], R2
/*00a8*/ EXIT
add_float4
Function : _Z10add_float4P6float4S0_S0_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]
/*0008*/ NOP
/*0010*/ MOV R3, c[0x0][0x2c]
/*0018*/ S2R R0, SR_CTAID.X
/*0020*/ SHR.U32 R3, R3, 0x2
/*0028*/ S2R R2, SR_TID.X
/*0030*/ IMAD R0, R0, c[0x0][0x8], R2
/*0038*/ ISETP.GE.U32.AND P0, PT, R0, R3, PT
/*0040*/ @P0 BRA.U 0x98
/*0048*/ @!P0 ISCADD R2, R0, c[0x0][0x20], 0x4
/*0050*/ @!P0 ISCADD R3, R0, c[0x0][0x24], 0x4
/*0058*/ @!P0 ISCADD R0, R0, c[0x0][0x28], 0x4
/*0060*/ @!P0 LD.128 R8, [R2]
/*0068*/ @!P0 LD.128 R4, [R3]
/*0070*/ @!P0 FADD R7, R11, R7
/*0078*/ @!P0 FADD R6, R10, R6
/*0080*/ @!P0 FADD R5, R9, R5
/*0088*/ @!P0 FADD R4, R8, R4
/*0090*/ @!P0 ST.128 [R0], R4
/*0098*/ EXIT
正如 njuffa 所提到的,可以看出这三种情况分别使用了不同的加载指令:LD、LD.64 和 LD.128。
最后是代码:
#include <thrust/device_vector.h>
#define BLOCKSIZE 256
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
// Wraps a CUDA API call and reports failures with file/line context.
// Fix: the original expanded to a bare `{ ... }`, which breaks when used as
// the body of an unbraced `if`/`else`; `do { } while (0)` makes the macro a
// single statement and safe in all statement positions.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Prints a descriptive message for a failed CUDA call; unless `abort` is
// false, terminates the process with the CUDA error code as exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Element-wise addition d_c = d_a + d_b using scalar 32-bit loads/stores,
// 4 floats per thread. N is the total number of float elements; launch with
// iDivUp(N/4, BLOCKSIZE) blocks so the grid covers all groups of four.
// Fix: the original guard was only `tid < N`, yet the body dereferenced
// tid+1 .. tid+3, reading/writing out of bounds whenever N is not a multiple
// of 4. The tail is now handled element by element.
__global__ void add_float(float *d_a, float *d_b, float *d_c, unsigned int N) {
    const int tid = 4 * threadIdx.x + blockIdx.x * (4 * blockDim.x);
    if (tid + 3 < N) {
        // Fast path: full group of four scalar loads per input array.
        float a1 = d_a[tid];
        float b1 = d_b[tid];
        float a2 = d_a[tid+1];
        float b2 = d_b[tid+1];
        float a3 = d_a[tid+2];
        float b3 = d_b[tid+2];
        float a4 = d_a[tid+3];
        float b4 = d_b[tid+3];
        d_c[tid]   = a1 + b1;
        d_c[tid+1] = a2 + b2;
        d_c[tid+2] = a3 + b3;
        d_c[tid+3] = a4 + b4;
    } else {
        // Tail (N % 4 != 0): at most 3 remaining elements, handled safely.
        for (int i = tid; i < (int)N; ++i)
            d_c[i] = d_a[i] + d_b[i];
    }
}
// Element-wise addition using float2 (64-bit LD.64/ST.64) loads/stores,
// 2 float2 values (= 4 floats) per thread. N is the total number of FLOAT
// elements as passed by the caller, so each array holds N/2 float2 values.
// Fix: the original guard compared tid against N (the float count) instead of
// the float2 element count N/2, and still accessed tid+1 — with a grid larger
// than the one used here that reads/writes out of bounds. The guard now uses
// the element count and handles a possible odd tail.
__global__ void add_float2(float2 *d_a, float2 *d_b, float2 *d_c, unsigned int N) {
    const int tid = 2 * threadIdx.x + blockIdx.x * (2 * blockDim.x);
    const unsigned int M = N / 2;  // number of float2 elements per array
    if (tid + 1 < M) {
        // Fast path: two float2 elements per thread.
        float2 a1 = d_a[tid];
        float2 b1 = d_b[tid];
        float2 a2 = d_a[tid+1];
        float2 b2 = d_b[tid+1];
        float2 c1;
        c1.x = a1.x + b1.x;
        c1.y = a1.y + b1.y;
        float2 c2;
        c2.x = a2.x + b2.x;
        c2.y = a2.y + b2.y;
        d_c[tid]   = c1;
        d_c[tid+1] = c2;
    } else if (tid < M) {
        // Tail: a single remaining float2 element.
        float2 a1 = d_a[tid];
        float2 b1 = d_b[tid];
        float2 c1;
        c1.x = a1.x + b1.x;
        c1.y = a1.y + b1.y;
        d_c[tid] = c1;
    }
}
// Element-wise addition using float4 (128-bit LD.128/ST.128) loads/stores,
// one float4 (= 4 floats) per thread. N is the total number of float
// elements, so each array holds N/4 float4 values; the pointers must be
// 16-byte aligned for the vector loads to be valid.
__global__ void add_float4(float4 *d_a, float4 *d_b, float4 *d_c, unsigned int N) {
    const int tid = 1 * threadIdx.x + blockIdx.x * (1 * blockDim.x);
    if (tid < N/4) {
        const float4 lhs = d_a[tid];
        const float4 rhs = d_b[tid];
        d_c[tid] = make_float4(lhs.x + rhs.x,
                               lhs.y + rhs.y,
                               lhs.z + rhs.z,
                               lhs.w + rhs.w);
    }
}
// Benchmarks the three kernels (scalar float, float2, and float4 loads) on
// N-element arrays filled with constants, times each launch with CUDA events,
// and verifies every output element on the host. Returns -1 on a mismatch.
// Fixes vs. the original:
//  * cudaEventCreate(&stop) was called again before the float2/float4 runs,
//    leaking the previously created event; events are now created once and
//    destroyed at the end.
//  * gpuErrchk(cudaPeekAtLastError()) appeared twice after each launch; one
//    check (plus cudaDeviceSynchronize for async errors) is enough.
//  * CUDA event API return codes are now checked.
int main() {
    const int N = 4 * 10000000;   // total number of floats (a multiple of 4)
    const float a = 3.f;
    const float b = 5.f;

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    float time;

    // ---------------- float: scalar 32-bit loads ----------------
    {
        thrust::device_vector<float> d_A(N, a);
        thrust::device_vector<float> d_B(N, b);
        thrust::device_vector<float> d_C(N);

        gpuErrchk(cudaEventRecord(start, 0));
        add_float<<<iDivUp(N / 4, BLOCKSIZE), BLOCKSIZE>>>(
            thrust::raw_pointer_cast(d_A.data()),
            thrust::raw_pointer_cast(d_B.data()),
            thrust::raw_pointer_cast(d_C.data()), N);
        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time: %3.1f ms \n", time);
        gpuErrchk(cudaPeekAtLastError());     // launch-configuration errors
        gpuErrchk(cudaDeviceSynchronize());   // asynchronous execution errors

        thrust::host_vector<float> h_float = d_C;
        for (int i = 0; i < N; i++) {
            if (h_float[i] != (a + b)) {
                printf("Error for add_float at %i: result is %f\n", i, h_float[i]);
                return -1;
            }
        }
    }

    // ---------------- float2: 64-bit vector loads ----------------
    {
        thrust::device_vector<float> d_A2(N, a);
        thrust::device_vector<float> d_B2(N, b);
        thrust::device_vector<float> d_C2(N);

        gpuErrchk(cudaEventRecord(start, 0));
        add_float2<<<iDivUp(N / 4, BLOCKSIZE), BLOCKSIZE>>>(
            (float2*)thrust::raw_pointer_cast(d_A2.data()),
            (float2*)thrust::raw_pointer_cast(d_B2.data()),
            (float2*)thrust::raw_pointer_cast(d_C2.data()), N);
        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time: %3.1f ms \n", time);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        thrust::host_vector<float> h_float2 = d_C2;
        for (int i = 0; i < N; i++) {
            if (h_float2[i] != (a + b)) {
                printf("Error for add_float2 at %i: result is %f\n", i, h_float2[i]);
                return -1;
            }
        }
    }

    // ---------------- float4: 128-bit vector loads ----------------
    {
        thrust::device_vector<float> d_A4(N, a);
        thrust::device_vector<float> d_B4(N, b);
        thrust::device_vector<float> d_C4(N);

        gpuErrchk(cudaEventRecord(start, 0));
        add_float4<<<iDivUp(N / 4, BLOCKSIZE), BLOCKSIZE>>>(
            (float4*)thrust::raw_pointer_cast(d_A4.data()),
            (float4*)thrust::raw_pointer_cast(d_B4.data()),
            (float4*)thrust::raw_pointer_cast(d_C4.data()), N);
        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time: %3.1f ms \n", time);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        thrust::host_vector<float> h_float4 = d_C4;
        for (int i = 0; i < N; i++) {
            if (h_float4[i] != (a + b)) {
                printf("Error for add_float4 at %i: result is %f\n", i, h_float4[i]);
                return -1;
            }
        }
    }

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    return 0;
}
make_float4 有很多例子,而你发布的那个并不是在那段代码中使用的。 - talonmies
这些是 vector_types.h 中声明的函数。通过使用适当的向量类型(例如 float4),编译器可以生成在单个事务中加载整个数据量的指令。在一定限度内,对于某些向量排列,这可以解决 AoS/SoA 问题。因此,是的,它可以更高效,具体取决于您要与之比较的内容。 - Robert Crovella
硬件支持 float、float2 和 float4 数据类型(以及 int、int2 和 int4 类型)的加载指令。为了使加载指令正常工作,数据必须自然对齐;通常来说,更宽的数据加载提供更高的峰值内存带宽。因此,出于性能考虑,应该选择 float4 而不是 float3。 - njuffa
我在 helper_math.h 中找到了一组构造函数。 - ilciavo