我注意到一种奇怪的现象：可以使用三重尖括号语法（<<<>>>）在CUDA中“启动”宿主（__host__）函数。为了测试这个现象，我编写了一个简单的内核，在两个整数数组之间复制数据。请注意，我是在Tesla K40上运行所有这些代码，并使用-gencode arch=compute_35,code=sm_35进行编译：
#ifndef HOST_LAUNCH_H
#define HOST_LAUNCH_H
using namespace std;
// Assumes input and output are both length 32
// Copies one int per thread from `input` to `output`.
// Expected launch: <<<1, 32>>> over two 32-element device arrays.
// There is no bounds check, so the total thread count must not exceed
// the array length (both are assumed to be length 32).
__global__ void CopyKernel(const int* input, int* output) {
    const size_t idx = blockDim.x * blockIdx.x + threadIdx.x;
    output[idx] = input[idx];
}
// Host wrapper that round-trips a 32-element int array through the GPU:
// allocates device buffers, uploads `input`, runs CopyKernel with one
// 32-thread block, and downloads the result into `output`.
// Preconditions: `input` and `output` each point to at least 32 ints.
// On any CUDA failure the function cleans up what it allocated and
// returns early, leaving `output` untouched (the original ignored every
// return code and would have passed null device pointers onward).
__host__ void Copy(const int* input, int* output) {
    const size_t kBytes = 32 * sizeof(int);
    int* d_input = 0;
    int* d_output = 0;
    if (cudaMalloc((void**)&d_input, kBytes) != cudaSuccess) {
        return;
    }
    if (cudaMalloc((void**)&d_output, kBytes) != cudaSuccess) {
        cudaFree(d_input);
        return;
    }
    if (cudaMemcpy(d_input, input, kBytes, cudaMemcpyHostToDevice) == cudaSuccess) {
        CopyKernel<<<1,32>>>(d_input, d_output);
        // Kernel launches do not return an error directly; launch-config
        // problems surface via cudaGetLastError().
        if (cudaGetLastError() == cudaSuccess) {
            // This blocking copy also synchronizes with the kernel above.
            cudaMemcpy(output, d_output, kBytes, cudaMemcpyDeviceToHost);
        }
    }
    cudaFree(d_input);
    cudaFree(d_output);
}
#endif
接下来,我编写了以下单元测试:
#include "host_launch.h"
#include <assert.h>
using namespace std;
// Baseline test: a normal __global__ kernel launched with <<<1,32>>>
// copies all 32 values from the input buffer to the output buffer.
__host__ void TestKernelLaunch() {
    const int kN = 32;
    int host_in[kN];
    int host_out[kN];
    for (int i = 0; i < kN; ++i) {
        host_in[i] = i;
        host_out[i] = 0;
    }
    int* dev_in = 0;
    int* dev_out = 0;
    cudaMalloc((void**)&dev_in, kN * sizeof(int));
    cudaMalloc((void**)&dev_out, kN * sizeof(int));
    cudaMemcpy(dev_in, host_in, kN * sizeof(int), cudaMemcpyHostToDevice);
    // The host-side output must still be all zeros before the kernel runs.
    for (int i = 0; i < kN; ++i)
        assert(host_out[i] == 0);
    CopyKernel<<<1,32>>>(dev_in, dev_out);
    // Blocking cudaMemcpy synchronizes with the preceding launch.
    cudaMemcpy(host_out, dev_out, kN * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < kN; ++i)
        assert(host_out[i] == i);
    cudaFree(dev_in);
    cudaFree(dev_out);
}
// NOTE(review): this test is intentionally left in a broken state to
// probe how <<<>>> interacts with __host__ functions:
//  * Uncommenting the Copy<<<1,32>>> line below makes nvcc reject the
//    file with "a host function call cannot be configured" (the
//    compiler can see that Copy is __host__).
//  * As written, nothing ever writes d_output, so the final asserts
//    (output[i] == i + 1) are expected to fail at runtime.
__host__ void TestHostLaunch() {
int input[32];
int output[32];
for(int i = 0; i < 32; i++) {
input[i] = i + 1;
output[i] = 0;
}
int* d_input = 0;
int* d_output = 0;
cudaMalloc((void**)&d_input, 32 * sizeof(int));
cudaMalloc((void**)&d_output, 32 * sizeof(int));
cudaMemcpy(d_input, input, 32 * sizeof(int), cudaMemcpyHostToDevice);
// Sanity check: output is untouched before any launch.
for(int i = 0; i < 32; i++) {
assert(output[i] == 0);
}
// Direct launch-configuration of a __host__ function does not compile:
//Copy<<<1,32>>>(d_input, d_output);
cudaMemcpy(output, d_output, 32 * sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0; i < 32; i++) {
assert(output[i] == i + 1);
}
cudaFree(d_input);
cudaFree(d_output);
}
// Same shape as TestHostLaunch, but the launch target arrives as a
// plain function pointer, so nvcc cannot tell at compile time whether
// it points at a __global__ kernel or a __host__ function, and it
// accepts the <<<>>> execution configuration.
// NOTE(review): when a __host__ function (e.g. Copy) is passed here,
// the configuration appears to be set up and the function is then
// simply invoked on the host — which is why the asserts still pass.
// This relies on unspecified behavior of the toolchain; confirm before
// depending on it.
__host__ void TestFunctionPointerLaunch(void (*f)(const int*, int*)) {
int input[32];
int output[32];
for(int i = 0; i < 32; i++) {
input[i] = i + 2;
output[i] = 0;
}
int* d_input = 0;
int* d_output = 0;
cudaMalloc((void**)&d_input, 32 * sizeof(int));
cudaMalloc((void**)&d_output, 32 * sizeof(int));
cudaMemcpy(d_input, input, 32 * sizeof(int), cudaMemcpyHostToDevice);
// Sanity check: output is untouched before the "launch".
for(int i = 0; i < 32; i++) {
assert(output[i] == 0);
}
// Compiles even when f is a __host__ function pointer.
f<<<1,32>>>(d_input, d_output);
cudaMemcpy(output, d_output, 32 * sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0; i < 32; i++) {
assert(output[i] == i + 2);
}
cudaFree(d_input);
cudaFree(d_output);
}
// Drives the experiments. Note that passing the __host__ function Copy
// through the pointer parameter compiles, even though a direct
// Copy<<<...>>> launch would not (see TestHostLaunch).
int main() {
    TestKernelLaunch();
    TestFunctionPointerLaunch(CopyKernel);
    TestFunctionPointerLaunch(Copy);
    return 0;
}
如果我取消该行的注释:
//Copy<<<1,32>>>(d_input, d_output);
我会得到如下编译错误：
host_launch_unittest.cu(49): error: a host function call cannot be configured
但是等价的操作可以使用以下方式进行:
f<<<1,32>>>(d_input, d_output);
在TestFunctionPointerLaunch函数中,它通过了所有断言。我只是想知道GPU在底层做了什么使得这个主机函数的启动行为正确。我编写这些测试来隔离这种行为,但也发现它适用于更复杂的内核/主机函数。此外,我决定计时这些操作,以查看它们是否被编译为等效操作:
#include "host_launch.h"
#include <iostream>
#include <assert.h>
using namespace std;
// Times `copy_count` back-to-back <<<1,32>>> launches of `f` and
// returns the mean time per launch in milliseconds. `f` is either a
// __global__ kernel (CopyKernel) or — the anomaly under test — a
// __host__ wrapper (Copy) invoked through a function pointer.
// Preconditions: copy_count > 0; `f` copies 32 ints from its first
// argument to its second.
__host__ float MeanCopyTime(const int copy_count, void (*f)(const int*, int*)) {
    const size_t kCount = 32 * (size_t)copy_count;
    const size_t kBytes = kCount * sizeof(int);
    // Heap-allocate the host buffers: the original used variable-length
    // stack arrays, which are non-standard C++ and risk stack overflow
    // for large copy_count (1.25 MB each at copy_count = 10000).
    int* input = new int[kCount];
    int* output = new int[kCount];
    for (size_t i = 0; i < kCount; i++) {
        input[i] = (int)i;
        output[i] = 0;
    }
    int* d_input = 0;
    int* d_output = 0;
    cudaMalloc((void**)&d_input, kBytes);
    cudaMalloc((void**)&d_output, kBytes);
    cudaMemcpy(d_input, input, kBytes, cudaMemcpyHostToDevice);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    for (int i = 0; i < copy_count; i++)
        f<<<1,32>>>(d_input + i * 32, d_output + i * 32);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float msecs = 0;
    cudaEventElapsedTime(&msecs, start, stop);
    cudaMemcpy(output, d_output, kBytes, cudaMemcpyDeviceToHost);
    // Destroy the events (leaked in the original) before freeing memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_input);
    cudaFree(d_output);
    for (size_t i = 0; i < kCount; i++) {
        assert(output[i] == (int)i);
    }
    delete[] input;
    delete[] output;
    return msecs / copy_count;
}
// Compares the mean per-launch cost of a genuine kernel launch against
// the host-function "launch" routed through a function pointer.
int main() {
    const int kCopyCount = 10000;
    cout << endl;
    cout << "Average Kernel Launch Time: "
         << MeanCopyTime(kCopyCount, CopyKernel) << endl;
    cout << "Average Host Function Launch Time: "
         << MeanCopyTime(kCopyCount, Copy) << endl;
    cout << endl;
    return 0;
}
对于我的架构,这将返回:
Average Kernel Launch Time: 0.00420756
Average Host Function Launch Time: 0.169097
再次,如果您对这里发生的事情有任何想法,我们将不胜感激。
（注：以上为完整的问题描述与测试代码，核心疑问是宿主函数通过函数指针配合 <<<>>> 语法“启动”时，底层究竟发生了什么。）