为回答您的问题,我考虑了四种不同的内核,其中每个线程对
n_loop
次迭代执行一个
for
循环。这四个内核实现了四种可能的情况:
- 编译时已知迭代次数
n_loop
;
- 编译时已知迭代次数
n_loop
,且求和是有条件的;
- 运行时已知迭代次数
n_loop
;
- 运行时已知迭代次数
n_loop
,并进行手动循环展开。
完整代码如下:
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE 512
#define epsilon 0.5
#define n_loop 8
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel1(float* input, float* output, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float accum = 0.f;
for (int i = 0; i < n_loop; i++) {
accum = accum + input[n_loop*tid+i];
}
output[tid] = accum;
}
}
__global__ void testKernel2(float* input, float* output, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float accum = 0.f;
for (int i = 0; i < n_loop; i++) if (input[n_loop*tid+i] < epsilon) accum = accum + input[n_loop*tid+i];
output[tid] = accum;
}
}
__global__ void testKernel3(float* input, float* output, int N_loop, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float accum = 0.f;
for (int i = 0; i < N_loop; i++) accum = accum + input[N_loop*tid+i];
output[tid] = accum;
}
}
__global__ void testKernel4(float* input, float* output, int N_loop, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float accum1 = 0.f;
float accum2 = 0.f;
float accum3 = 0.f;
float accum4 = 0.f;
for (int i = 0; i < N_loop/4; i++) {
accum1 = accum1 + input[N_loop*tid+i];
accum2 = accum2 + input[N_loop*tid+i+N_loop/4];
accum3 = accum3 + input[N_loop*tid+i+2*N_loop/4];
accum4 = accum4 + input[N_loop*tid+i+3*N_loop/4];
}
output[tid] = accum1 + accum2 + accum3 + accum4;
}
}
int main() {
const int N = 512*512*32;
float* input = (float*) malloc(n_loop*N*sizeof(float));
float* output = (float*) malloc(N*sizeof(float));
float* output2 = (float*) malloc(N*sizeof(float));
float* outputif = (float*) malloc(N*sizeof(float));
float* d_input; gpuErrchk(cudaMalloc((void**)&d_input, n_loop*N*sizeof(float)));
float* d_output; gpuErrchk(cudaMalloc((void**)&d_output, N*sizeof(float)));
srand(time(NULL));
for (int i=0; i<n_loop*N; i++) input[i] = rand() / (float)RAND_MAX;
gpuErrchk(cudaMemcpy(d_input, input, n_loop*N*sizeof(float), cudaMemcpyHostToDevice));
for (int k = 0; k < N; k++) {
float accum1 = 0.f;
float accum2 = 0.f;
for (int i = 0; i < n_loop; i++) {
accum1 = accum1 + input[n_loop*k+i];
if (input[n_loop*k+i] < epsilon) accum2 = accum2 + input[n_loop*k+i];
}
output[k] = accum1;
outputif[k] = accum2;
}
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
testKernel1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel1 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++)
if (output[i] != output2[i]) {
printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
return 1;
}
printf("kernel1: results match!\n");
cudaEventRecord(start, 0);
testKernel2<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel1 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++)
if (outputif[i] != output2[i]) {
printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, outputif[i], output2[i]);
return 1;
}
printf("kernel2: results match!\n");
cudaEventRecord(start, 0);
testKernel3<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel3 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++)
if (output[i] != output2[i]) {
printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
return 1;
}
printf("kernel3: results match!\n");
cudaEventRecord(start, 0);
testKernel4<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel4 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++)
if (abs(output[i] - output2[i]) > 0.0001) {
printf("Mismatch at i = %d, Host= %f, Device = %f, difference = %f\n", i, output[i], output2[i], output2[i] - output[i]);
return 1;
}
printf("kernel4: results match!\n");
return 0;
}
现在让我们分析四种不同情况的反汇编代码(使用CUDA 6.0编译)。我考虑针对Fermi架构进行编译。
内核1
MOV R1, c[0x1][0x100]
S2R R0, SR_CTAID.X
IMUL R2, R0, c[0x0][0x8]
S2R R3, SR_TID.X
IADD R0, R2, R3
ISETP.GE.AND P0, PT, R0, c[0x0][0x28], PT
@P0 BRA.U 0xd8
@!P0 IADD R2, R3, R2
@!P0 ISCADD R2, R2, c[0x0][0x20], 0x5
@!P0 ISCADD R0, R0, c[0x0][0x24], 0x2
@!P0 LD R9, [R2]
@!P0 LD R8, [R2+0x4]
@!P0 LD R7, [R2+0x8]
@!P0 LD R6, [R2+0xc]
@!P0 LD R5, [R2+0x10]
@!P0 LD R4, [R2+0x14]
@!P0 LD R3, [R2+0x18]
@!P0 LD R2, [R2+0x1c]
@!P0 F2F.F32.F32 R9, R9
@!P0 FADD R8, R9, R8
@!P0 FADD R7, R8, R7
@!P0 FADD R6, R7, R6
@!P0 FADD R5, R6, R5
@!P0 FADD R4, R5, R4
@!P0 FADD R3, R4, R3
@!P0 FADD R2, R3, R2
@!P0 ST [R0], R2
EXIT
在这种情况下,编译器完全展开了循环。你会看到8个不同的加载(LD)指令和7个不同的加法(FADD)指令。
第二个内核。
MOV R1, c[0x1][0x100]
S2R R0, SR_CTAID.X
IMUL R0, R0, c[0x0][0x8]
S2R R2, SR_TID.X
IADD R3, R0, R2
ISETP.GE.AND P0, PT, R3, c[0x0][0x28], PT
@P0 EXIT
IADD R0, R2, R0
ISCADD R9, R0, c[0x0][0x20], 0x5
LD R0, [R9]
LD R2, [R9+0x4]
LD R4, [R9+0x8]
LD R5, [R9+0xc]
LD R6, [R9+0x10]
LD R7, [R9+0x14]
LD R8, [R9+0x18]
LD R9, [R9+0x1c]
FSETP.LT.AND P0, PT, R0, 0.5, PT
FSETP.LT.AND P1, PT, R4, 0.5, PT
F2F.F32.F32 R0, R0
SEL R0, R0, RZ, P0
FSETP.LT.AND P0, PT, R2, 0.5, PT
@P0 FADD R0, R0, R2
FSETP.LT.AND P0, PT, R5, 0.5, PT
@P1 FADD R0, R0, R4
@P0 FADD R0, R0, R5
FSETP.LT.AND P1, PT, R8, 0.5, PT
FSETP.LT.AND P0, PT, R6, 0.5, PT
FADD R2, R0, R6
SEL R2, R2, R0, P0
FSETP.LT.AND P0, PT, R7, 0.5, PT
ISCADD R0, R3, c[0x0][0x24], 0x2
@P0 FADD R2, R2, R7
FSETP.LT.AND P0, PT, R9, 0.5, PT
@P1 FADD R2, R2, R8
@P0 FADD R2, R2, R9
ST [R0], R2
EXIT
同样的情况下,编译器完全展开了循环。您将再次看到8个不同的加载(LD
)指令和7个不同的加法(FADD
)指令。
内核3
c[0x0][0x30] = N
c[0x1][0x100] = BLOCKSIZE
c[0x0][0x8] = blockDim.x
c[0x0][0x30] = N_loop
c[0x0][0x20] = input
/*0000*/ MOV R1, c[0x1][0x100]
/*0008*/ S2R R0, SR_CTAID.X
/*0010*/ S2R R2, SR_TID.X
/*0018*/ IMAD R0, R0, c[0x0][0x8], R2
/*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x34], PT
/*0028*/ @P0 EXIT
/*0030*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x30], PT
/*0038*/ @P0 BRA 0x60
/*0040*/ MOV R4, RZ
/*0048*/ BRA 0x170
/*0050*/ NOP
/*0058*/ NOP
/*0060*/ MOV R2, c[0x0][0x30]
/*0068*/ IMUL R3, R0, c[0x0][0x30]
/*0070*/ MOV32I R6, 0x4
/*0078*/ ISETP.GT.AND P0, PT, R2, 0x3, PT
/*0080*/ IMAD R2.CC, R3, R6, c[0x0][0x20]
/*0088*/ MOV R4, RZ
/*0090*/ MOV R5, RZ
/*0098*/ IMAD.HI.X R3, R3, R6, c[0x0][0x24]
/*00a0*/ @!P0 BRA 0x128
/*00a8*/ MOV R6, c[0x0][0x30]
/*00b0*/ IADD R10, R6, -0x3
/*00b8*/ NOP
/*00c0*/ IADD R5, R5, 0x4
/*00c8*/ LD.E R6, [R2]
/*00d0*/ ISETP.LT.AND P0, PT, R5, R10, PT
/*00d8*/ LD.E R7, [R2+0x4]
/*00e0*/ LD.E R8, [R2+0x8]
/*00e8*/ LD.E R9, [R2+0xc]
/*00f0*/ IADD R2.CC, R2, 0x10
/*00f8*/ IADD.X R3, R3, RZ
/*0100*/ FADD R6, R4, R6
/*0108*/ FADD R4, R6, R7
/*0110*/ FADD R8, R4, R8
/*0118*/ FADD R4, R8, R9
/*0120*/ @P0 BRA 0xc0
/*0128*/ ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT
/*0130*/ @!P0 BRA 0x170
/*0138*/ IADD R5, R5, 0x1
/*0140*/ LD.E R6, [R2]
/*0148*/ ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT
/*0150*/ IADD R2.CC, R2, 0x4
/*0158*/ IADD.X R3, R3, RZ
/*0160*/ FADD R4, R4, R6
/*0168*/ @P0 BRA 0x138
/*0170*/ MOV32I R3, 0x4
/*0178*/ IMAD R2.CC, R0, R3, c[0x0][0x28]
/*0180*/ IMAD.HI.X R3, R0, R3, c[0x0][0x2c]
/*0188*/ ST.E [R2], R4
/*0190*/ EXIT
如图所示,编译器会自动对4
进行循环展开,因为可以看到有4
个加载操作(LD
)和3
个不同的加法操作(FADD
)。
内核4
/*0000*/ MOV R1, c[0x1][0x100]
/*0008*/ S2R R0, SR_CTAID.X
/*0010*/ S2R R2, SR_TID.X
/*0018*/ IMAD R13, R0, c[0x0][0x8], R2
/*0020*/ ISETP.GE.AND P0, PT, R13, c[0x0][0x34], PT
/*0028*/ @P0 EXIT
/*0030*/ MOV R2, c[0x0][0x30]
/*0038*/ SHR R0, R2, 0x1f
/*0040*/ ISETP.GT.AND P0, PT, R2, 0x3, PT
/*0048*/ IMAD.U32.U32.HI R0, R0, 0x4, R2
/*0050*/ SHR R0, R0, 0x2
/*0058*/ @P0 BRA 0x98
/*0060*/ MOV R18, RZ
/*0068*/ MOV R19, RZ
/*0070*/ MOV R10, RZ
/*0078*/ MOV R11, RZ
/*0080*/ BRA 0x308
/*0088*/ NOP
/*0090*/ NOP
/*0098*/ MOV R3, c[0x0][0x30]
/*00a0*/ IMUL R4, R13, c[0x0][0x30]
/*00a8*/ MOV32I R5, 0x4
/*00b0*/ IMUL R2, R3, 0x3
/*00b8*/ SHL R6, R3, 0x1
/*00c0*/ IADD R10, R0, R4
/*00c8*/ SHR R3, R2, 0x1f
/*00d0*/ IMAD R8.CC, R4, R5, c[0x0][0x20]
/*00d8*/ SHR R7, R6, 0x1f
/*00e0*/ IMAD.U32.U32.HI R2, R3, 0x4, R2
/*00e8*/ IMAD.HI.X R9, R4, R5, c[0x0][0x24]
/*00f0*/ IMAD.U32.U32.HI R7, R7, 0x4, R6
/*00f8*/ IMAD.HI R3, R2, c[0x10][0x0], R4
/*0100*/ IMAD R6.CC, R10, R5, c[0x0][0x20]
/*0108*/ ISETP.GT.AND P0, PT, R0, 0x1, PT
/*0110*/ IMAD.HI R14, R7, c[0x10][0x0], R4
/*0118*/ MOV R18, RZ
/*0120*/ IMAD.HI.X R7, R10, R5, c[0x0][0x24]
/*0128*/ MOV R19, RZ
/*0130*/ IMAD R2.CC, R3, R5, c[0x0][0x20]
/*0138*/ MOV R10, RZ
/*0140*/ IMAD.HI.X R3, R3, R5, c[0x0][0x24]
/*0148*/ MOV R11, RZ
/*0150*/ IMAD R4.CC, R14, R5, c[0x0][0x20]
/*0158*/ MOV R12, RZ
/*0160*/ IMAD.HI.X R5, R14, R5, c[0x0][0x24]
/*0168*/ @!P0 BRA 0x260
/*0170*/ IADD R16, R0, -0x1
/*0178*/ NOP
/*0180*/ IADD R12, R12, 0x2
/*0188*/ LD.E R15, [R8]
/*0190*/ ISETP.LT.AND P0, PT, R12, R16, PT
/*0198*/ LD.E R20, [R6]
/*01a0*/ FADD R17, R18, R15
/*01a8*/ LD.E R14, [R4]
/*01b0*/ FADD R19, R19, R20
/*01b8*/ LD.E R15, [R2]
/*01c0*/ LD.E R18, [R8+0x4]
/*01c8*/ LD.E R20, [R6+0x4]
/*01d0*/ IADD R6.CC, R6, 0x8
/*01d8*/ NOP
/*01e0*/ FADD R14, R10, R14
/*01e8*/ FADD R15, R11, R15
/*01f0*/ IADD.X R7, R7, RZ
/*01f8*/ LD.E R10, [R4+0x4]
/*0200*/ IADD R4.CC, R4, 0x8
/*0208*/ LD.E R11, [R2+0x4]
/*0210*/ IADD.X R5, R5, RZ
/*0218*/ FADD R18, R17, R18
/*0220*/ IADD R2.CC, R2, 0x8
/*0228*/ FADD R19, R19, R20
/*0230*/ IADD.X R3, R3, RZ
/*0238*/ IADD R8.CC, R8, 0x8
/*0240*/ IADD.X R9, R9, RZ
/*0248*/ FADD R10, R14, R10
/*0250*/ FADD R11, R15, R11
/*0258*/ @P0 BRA 0x180
/*0260*/ ISETP.LT.AND P0, PT, R12, R0, PT
/*0268*/ @!P0 BRA 0x308
/*0270*/ IADD R12, R12, 0x1
/*0278*/ LD.E R17, [R8]
/*0280*/ ISETP.LT.AND P0, PT, R12, R0, PT
/*0288*/ LD.E R16, [R6]
/*0290*/ IADD R6.CC, R6, 0x4
/*0298*/ LD.E R15, [R4]
/*02a0*/ IADD.X R7, R7, RZ
/*02a8*/ LD.E R14, [R2]
/*02b0*/ IADD R4.CC, R4, 0x4
/*02b8*/ IADD.X R5, R5, RZ
/*02c0*/ IADD R2.CC, R2, 0x4
/*02c8*/ IADD.X R3, R3, RZ
/*02d0*/ IADD R8.CC, R8, 0x4
/*02d8*/ IADD.X R9, R9, RZ
/*02e0*/ FADD R18, R18, R17
/*02e8*/ FADD R19, R19, R16
/*02f0*/ FADD R10, R10, R15
/*02f8*/ FADD R11, R11, R14
/*0300*/ @P0 BRA 0x270
/*0308*/ FADD R0, R18, R19
/*0310*/ MOV32I R3, 0x4
/*0318*/ FADD R0, R0, R10
/*0320*/ IMAD R2.CC, R13, R3, c[0x0][0x28]
/*0328*/ FADD R0, R0, R11
/*0330*/ IMAD.HI.X R3, R13, R3, c[0x0][0x2c]
/*0338*/ ST.E [R2], R0
/*0340*/ EXIT
在这种情况下,编译器会自动执行循环展开
4
,这与手动循环展开
4
重叠。因此我看到了
8
次加载操作(
LD
)和
7
次不同的加法(
FADD
)。
尽管反汇编代码与Fermi架构不同,但编译器行为在Kepler架构中也是相似的。
由于自动循环展开能力,不同内核之间的性能差异不大。
GT 210 (c.c. 1.2)
Kernel 1 = 111ms
Kernel 2 = 108ms
Kernel 3 = 107ms
Kernel 4 = 110ms
Kepler K20c (c.c. 3.5)
Kernel 1 = 1.8ms
Kernel 2 = 1.8ms
Kernel 3 = 1.8ms
Kernel 4 = 1.8ms
我没有明确提供 Fermi 架构的结果,但是四个考虑的内核的时间大约相同。