OpenCL
进行简单的矢量加法操作。我知道,Nvidia显卡速度更快,可以更好地完成任务。原则上,我可以在代码中放置一个if
语句,该语句将查找VENDOR
属性中的NVIDIA
。但我想要一些优雅的东西。在OpenCL C/C++
中编程选择更好(更快)的GPU的最佳方法是什么?vsum = 30 + 30 + 3 = 63
。重新计算权重:w1,w2 = 0.5*(1/3) + 0.5*300/10/63 = 0.4
,w3 = 0.5*(1/3) + 0.5*300/100/63 = 0.2
。11 s, 11 s, and 55 s
。(1-alpha)
项占主导地位,直到最终权重全部基于该项。在这种情况下,权重分别变为47%(427像素),47%,6%(46像素),时间分别为14 s, 14 s, 14 s
。在这种情况下,仅使用CPU比仅使用GPU的结果提高了1秒。速度 (v) = 距离/时间。在这种情况下,距离 (d
) 是要处理的像素数。则总距离为:d = v1*t1 + v2*t2 + v3*t3
我们希望它们都在同一时间完成,因此
d = (v1 + v2 + v3)*t
v_i*t = w_i*d
这提供了
w_i = v_i*t/d
由 (d = (v1 + v2 + v3)*t
) 可得 (t/d = 1/(v1 + v2 + v3)
),代入上式即得:
w_i = v_i/(v1 + v2 + v3)
很容易看出,这可以推广到任意数量的设备 k
w_i = v_i/(v1 + v2 + ...v_k)
我的算法中,vsum
代表“速度总和”。最后,由于v_i
是像素随时间的变化量,因此它等于n_i/t_i
,从而得到
w_i = n_i/t_i/(n1/t1 + n2/t2 + ...n_k/t_k)
CPU iGPU dGPU-1 dGPU-2 oc
Intel Intel Nvidia Nvidia
1024 1024 1024 1024
34 ms 5ms 10ms 9ms
Intel Intel Nvidia Nvidia
512 1536 1024 1024
16 ms 8ms 10ms 9ms
Intel Intel Nvidia Nvidia
256 1792 1024 1024
9ms 10ms 10ms 9ms
或者直到您可以启用更细的粒度。
Intel Intel Nvidia Nvidia
320 1728 1024 1024
10ms 10ms 10ms 9ms
Intel Intel Nvidia Nvidia
320 1728 960 1088
10ms 10ms 10ms 10ms
^ ^
| |
| PCI-E bandwidth not more than 16 GB/s per device
closer to RAM, better bandwidth (20-40 GB/s) and less kernel overhead
不要仅仅获取平衡的最新迭代,您可以获取最近10个结果的平均值(或PID),以消除误导平衡的峰值。另外,缓冲区复制可能需要比计算更长的时间,如果将其包括在平衡中,则可以关闭不必要/无益的设备。
如果您创建一个库,那么您就不必为每个新项目尝试基准测试。当您加速矩阵乘法、流体运动、SQL表连接和财务近似值时,它们将在设备之间自动平衡。
对于平衡的解决方案:
如果您能解决一个线性系统作为n个未知数(每个设备的负载)和n个方程(所有设备的基准结果),则可以在单个步骤中找到目标负载。如果您选择迭代,则需要更多步骤直到收敛。后者并不比编写基准测试更难。前者对我来说更难,但随着时间的推移应该更有效率。
虽然仅有向量相加的核不是真实的世界场景,但这里是我的系统中的一个真实基准测试:
// Memory-bound micro-benchmark: plain element-wise vector add,
// one work-item per element. Measures bandwidth more than compute.
__kernel void bench(__global float * a, __global float *b, __global float *c)
{
int i=get_global_id(0);
c[i]=a[i]+b[i];
}
2560 768 768
AMD FX(tm)-8150 Eight-Core Processor Oland Pitcairn
经过数次循环,FX即使进行额外的缓冲区复制,也不使用任何主机指针仍然更快。即使PCI-E带宽相同,Oland GPU正在追赶Pitcairn。
现在加入一些三角函数:
// Adds transcendental work (sin/cos) per element, so raw ALU/SFU
// throughput starts to matter alongside memory bandwidth.
__kernel void bench(__global float * a, __global float *b, __global float *c)
{
int i=get_global_id(0);
c[i]=sin(a[i])+cos(b[i])+sin(cos((float)i));
}
1792 1024 1280
测试GDDR3-128位与GDDR5-256位(超频)以及缓存。
// Re-reads a[i]/b[i] and read-modify-writes c[i] in global memory on every
// one of ~12000 iterations: stresses cache hierarchy and memory bandwidth.
// (Float non-associativity keeps the compiler from folding the chain away --
// presumably intentional; verify against the compiler used.)
__kernel void bench(__global float * a, __global float *b, __global float *c)
{
int i=get_global_id(0);
c[i]=a[i]+b[i]-a[i]-b[i]+a[i]+b[i]-b[i]-a[i]+b[i];
for(int j=0;j<12000;j++)
c[i]+=a[i]+b[i]-a[i]-b[i]+a[i]+b[i]-b[i]-a[i]+b[i];
}
256 256 3584
高计算与数据比:
// High compute-to-data ratio: inputs are loaded once into private variables
// (a0/b0/c0), then ~12000 trig iterations run with no global-memory traffic;
// only the final c0 is written back. Measures pure compute throughput.
__kernel void bench(__global float * a, __global float *b, __global float *c)
{
int i=get_global_id(0);
c[i]=0.0f; float c0=c[i];float a0=a[i];float b0=b[i];
for(int j=0;j<12000;j++)
c0+=sin(a0)+cos(b0*a0)+cos(sin(b0)*19.95f);
c[i]=c0;
}
256 2048 1792
好: 只需选择第一个兼容设备。在大多数系统上,只有一个可用。
更好: 你可以通过将CL_DEVICE_MAX_COMPUTE_UNITS设备信息结果乘以CL_DEVICE_MAX_CLOCK_FREQUENCY 设备信息结果来非常粗略地估算设备性能。根据你的工作负载,你可能需要包括其他指标,例如内存大小。你可以根据你的工作负载进行混合使用。
最佳: 在每个设备上使用你的确切工作流程进行基准测试。这确实是唯一的方法,因为其他任何方法都只是猜测。
最后,用户可能关心你正在使用哪些GPU,因此无论你选择哪种方法,你都应该有一些方式来覆盖程序选择。
请看下面GPU独显代码示例:
#include <iostream>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#pragma comment (lib, "x86_64/opencl.lib")
#endif
// OpenCL C source for the saxpy benchmark kernel (C[i] = alpha*A[i] + B[i]).
// Compiled at runtime by cl_BenchmarkDevice() for every device being scored.
const char* saxpy_kernel =
"__kernel \n"
"void saxpy_kernel(float alpha, \n"
" __global float *A, \n"
" __global float *B, \n"
" __global float *C) \n"
"{ \n"
" int idx = get_global_id(0); \n"
" C[idx] = alpha * A[idx] + B[idx]; \n"
"} ";
// Human-readable names for OpenCL error codes, indexed by the NEGATED code
// (clErrName[-err_code]). Codes -20..-29 are unassigned by the OpenCL spec
// and are mapped to placeholder names so the table stays densely indexable.
const char* clErrName[] = {
    "CL_SUCCESS", //0
    "CL_DEVICE_NOT_FOUND", //-1
    "CL_DEVICE_NOT_AVAILABLE", //-2
    "CL_COMPILER_NOT_AVAILABLE", //-3
    "CL_MEM_OBJECT_ALLOCATION_FAILURE", //-4
    "CL_OUT_OF_RESOURCES", //-5
    "CL_OUT_OF_HOST_MEMORY", //-6
    "CL_PROFILING_INFO_NOT_AVAILABLE", //-7
    "CL_MEM_COPY_OVERLAP", //-8
    "CL_IMAGE_FORMAT_MISMATCH", //-9
    "CL_IMAGE_FORMAT_NOT_SUPPORTED", //-10
    "CL_BUILD_PROGRAM_FAILURE", //-11
    "CL_MAP_FAILURE", //-12
    "CL_MISALIGNED_SUB_BUFFER_OFFSET", //-13
    "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST", //-14
    "CL_COMPILE_PROGRAM_FAILURE", //-15
    "CL_LINKER_NOT_AVAILABLE", //-16
    "CL_LINK_PROGRAM_FAILURE", //-17
    "CL_DEVICE_PARTITION_FAILED", //-18
    "CL_KERNEL_ARG_INFO_NOT_AVAILABLE", //-19
    "CL_UNDEFINED_ERROR_20", //-20
    "CL_UNDEFINED_ERROR_21", //-21
    "CL_UNDEFINED_ERROR_22", //-22
    "CL_UNDEFINED_ERROR_23", //-23
    "CL_UNDEFINED_ERROR_24", //-24
    "CL_UNDEFINED_ERROR_25", //-25
    "CL_UNDEFINED_ERROR_26", //-26
    "CL_UNDEFINED_ERROR_27", //-27
    "CL_UNDEFINED_ERROR_28", //-28
    "CL_UNDEFINED_ERROR_29", //-29
    "CL_INVALID_VALUE", //-30
    "CL_INVALID_DEVICE_TYPE", //-31
    "CL_INVALID_PLATFORM", //-32
    "CL_INVALID_DEVICE", //-33
    "CL_INVALID_CONTEXT", //-34
    "CL_INVALID_QUEUE_PROPERTIES", //-35
    "CL_INVALID_COMMAND_QUEUE", //-36
    "CL_INVALID_HOST_PTR", //-37
    "CL_INVALID_MEM_OBJECT", //-38
    "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", //-39
    "CL_INVALID_IMAGE_SIZE", //-40
    "CL_INVALID_SAMPLER", //-41
    "CL_INVALID_BINARY", //-42
    "CL_INVALID_BUILD_OPTIONS", //-43
    "CL_INVALID_PROGRAM", //-44
    "CL_INVALID_PROGRAM_EXECUTABLE", //-45
    "CL_INVALID_KERNEL_NAME", //-46
    "CL_INVALID_KERNEL_DEFINITION", //-47
    "CL_INVALID_KERNEL", //-48
    "CL_INVALID_ARG_INDEX", //-49
    "CL_INVALID_ARG_VALUE", //-50
    "CL_INVALID_ARG_SIZE", //-51
    "CL_INVALID_KERNEL_ARGS", //-52
    "CL_INVALID_WORK_DIMENSION", //-53
    "CL_INVALID_WORK_GROUP_SIZE", //-54
    "CL_INVALID_WORK_ITEM_SIZE", //-55
    "CL_INVALID_GLOBAL_OFFSET", //-56
    "CL_INVALID_EVENT_WAIT_LIST", //-57
    "CL_INVALID_EVENT", //-58
    "CL_INVALID_OPERATION", //-59
    "CL_INVALID_GL_OBJECT", //-60
    "CL_INVALID_BUFFER_SIZE", //-61
    "CL_INVALID_MIP_LEVEL", //-62
    "CL_INVALID_GLOBAL_WORK_SIZE", //-63
    "CL_INVALID_PROPERTY", //-64
    "CL_INVALID_IMAGE_DESCRIPTOR", //-65
    "CL_INVALID_COMPILER_OPTIONS", //-66
    "CL_INVALID_LINKER_OPTIONS", //-67
    "CL_INVALID_DEVICE_PARTITION_COUNT", //-68
    "CL_INVALID_PIPE_SIZE", //-69
    "CL_INVALID_DEVICE_QUEUE", //-70
};
// Largest negative error code covered by clErrName. Derived from the array
// itself (instead of a hand-maintained 70) so the two can never drift apart
// when new codes are appended.
const int MAX_ERR_CODE = (int)(sizeof(clErrName) / sizeof(clErrName[0])) - 1;
// Logs any non-success OpenCL status code to std::clog together with the
// call site, translating known codes through clErrName.
// Returns true when err_code is CL_SUCCESS, false otherwise (after logging).
// Always invoke via the clCallSuccess() macro so __FILE__/__LINE__ are
// filled in automatically.
inline bool __clCallSuccess(cl_int err_code, const char* source_file, const int source_line)
{
    if (err_code == CL_SUCCESS)
        return true;
    // Only negative codes that fit inside the name table get a symbolic name;
    // everything else (positive or below -MAX_ERR_CODE) is reported raw.
    const bool in_table = (err_code <= 0) && (err_code >= -MAX_ERR_CODE);
    if (in_table)
        std::clog << "\t - CL call error: " << clErrName[-err_code];
    else
        std::clog << "\t - unknown CL error: " << err_code;
    std::clog << " [" << source_file << " : " << source_line << "]" << std::endl;
    return false;
}
#define clCallSuccess(err_code) __clCallSuccess(err_code, __FILE__, __LINE__)
// Runs one saxpy pass (C = alpha*A + B, 512K floats) on the given device and
// returns the kernel execution time in microseconds, taken from OpenCL event
// profiling. Returns a negative value if any allocation or CL call failed.
// NOTE(review): profiling timestamps are only valid when command_queue was
// created with CL_QUEUE_PROFILING_ENABLE -- cl_GetBestDevice below does this;
// verify for any other caller.
// The deliberately nested if-ladder guarantees every successfully created
// object is released exactly once, in reverse order of creation.
float cl_BenchmarkDevice(cl_context context, cl_command_queue command_queue, cl_device_id device_id)
{
    float microSeconds = -1.0f;     // stays negative on any failure
    int i;
    cl_int clStatus;
    const int VECTOR_SIZE = 512 * 1024;
    // Allocate and initialize host vectors A, B and C
    float* A = (float*)malloc(sizeof(float) * VECTOR_SIZE); if (A) {
    float* B = (float*)malloc(sizeof(float) * VECTOR_SIZE); if (B) {
    float* C = (float*)malloc(sizeof(float) * VECTOR_SIZE); if (C) {
    for (i = 0; i < VECTOR_SIZE; i++)
    {
        A[i] = (float)i;
        B[i] = (float)(VECTOR_SIZE - i);
        C[i] = 0;
    }
    // Create memory buffers on the device for each vector
    cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus); if (clCallSuccess(clStatus)) {
    cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus); if (clCallSuccess(clStatus)) {
    cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus); if (clCallSuccess(clStatus)) {
    // Copy buffers A and B to the device (blocking writes)
    clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL); if (clCallSuccess(clStatus)) {
    clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL); if (clCallSuccess(clStatus)) {
    // Create a program from the kernel source and build it for this device
    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&saxpy_kernel, NULL, &clStatus); if (clCallSuccess(clStatus) && program) {
    clStatus = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (clCallSuccess(clStatus)) {
    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", &clStatus); if (clCallSuccess(clStatus) && kernel) {
    float alpha = 2.5f;
    // Set the arguments of the kernel
    clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void*)&alpha); if (clCallSuccess(clStatus)) {
    clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&A_clmem); if (clCallSuccess(clStatus)) {
    clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&B_clmem); if (clCallSuccess(clStatus)) {
    clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&C_clmem); if (clCallSuccess(clStatus)) {
    // Launch one work-item per element and let the runtime choose the
    // work-group size (an explicit local size of 512 is kept for reference).
    cl_event event;
    size_t global_size = VECTOR_SIZE;
    //size_t local_size = 512;
    //clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event);
    clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &event); if (clCallSuccess(clStatus)) {
        clStatus = clWaitForEvents(1, &event); if (clCallSuccess(clStatus)) {
            // Measure kernel duration from the event's profiling timestamps (ns).
            // BUGFIX: the return codes of these queries were previously ignored,
            // so a failed query produced a bogus (huge) "best" time.
            cl_ulong time_start;
            cl_ulong time_end;
            if (clCallSuccess(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL)) &&
                clCallSuccess(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL)))
            {
                microSeconds = (float)(time_end - time_start) / 1000.0f;
                std::clog << "\nOpenCl benchmarking time: " << microSeconds << " microseconds \n";
                std::clog << "\n\t*****************************\n\n";
            }
        }
        // Read the cl memory C_clmem on device back to host variable C (blocking)
        clCallSuccess(clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL));
        // Clean up and wait for all the commands to complete.
        clCallSuccess(clFlush(command_queue));
        clCallSuccess(clFinish(command_queue));
        // BUGFIX: the profiling event was leaked on every benchmark run
        clCallSuccess(clReleaseEvent(event));
    } //Kernel
    }}}} //SetKernelArg
    // Finally release all OpenCL allocated objects and host buffers.
    clCallSuccess(clReleaseKernel(kernel)); }
    } //BuildProgram
    clCallSuccess(clReleaseProgram(program)); }
    } } //EnqueueWriteBuffer
    clCallSuccess(clReleaseMemObject(C_clmem)); }
    clCallSuccess(clReleaseMemObject(B_clmem)); }
    clCallSuccess(clReleaseMemObject(A_clmem)); }
    free(C); }
    free(B); }
    free(A); }
    return microSeconds;
}
/*
struct _dev_info {
cl_platform_id platfID;
cl_device_id devID;
};
typedef struct _dev_info dev_info;
*/
// Enumerates every OpenCL platform and device, logs their properties,
// benchmarks each one with cl_BenchmarkDevice() and returns the device with
// the shortest measured kernel time (NULL when nothing could be benchmarked).
// The temporary contexts/queues used for scoring are released before return.
cl_device_id cl_GetBestDevice(void)
{
    cl_int err;
    cl_uint numPlatforms, numDevices;
    const cl_uint MAX_IDS = 10;          // capacity of the fixed id arrays below
    cl_platform_id platfIDs[10];
    cl_device_id devIDsAll[10];
    cl_device_id best_device = NULL;
    float best_perf = 100000000.0f;      // sentinel: worse than any real timing (us)
    if (clCallSuccess(clGetPlatformIDs(MAX_IDS, platfIDs, &numPlatforms)))
    {
        std::clog << "OpenCL platforms detected: " << numPlatforms << std::endl;
        // BUGFIX: the runtime reports the TOTAL number of platforms, which can
        // exceed the 10 ids actually stored -- clamp before indexing the array.
        if (numPlatforms > MAX_IDS)
            numPlatforms = MAX_IDS;
        for (unsigned int i = 0; i < numPlatforms; i++)
        {
            std::clog << "PlatformInfo for platform no." << (i + 1) << std::endl;
            const int SZ_INFO = 1024;
            char info[SZ_INFO];
            size_t sz;
            if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_NAME, SZ_INFO, info, &sz)))
                std::clog << " - - Name: " << info << std::endl;
            if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_VENDOR, SZ_INFO, info, &sz)))
                std::clog << " - - Vendor: " << info << std::endl;
            if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_PROFILE, SZ_INFO, info, &sz)))
                std::clog << " - - Profile: " << info << std::endl;
            if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_VERSION, SZ_INFO, info, &sz)))
                std::clog << " - - Version: " << info << std::endl;
            if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_EXTENSIONS, SZ_INFO, info, &sz)))
                std::clog << " - - Extensions: " << info << std::endl;
            if (clCallSuccess(clGetDeviceIDs(platfIDs[i], CL_DEVICE_TYPE_ALL, MAX_IDS, devIDsAll, &numDevices)))
            {
                // BUGFIX: same clamp for the device count (out-of-bounds read
                // on systems exposing more than 10 devices per platform).
                if (numDevices > MAX_IDS)
                    numDevices = MAX_IDS;
                cl_context_properties cProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platfIDs[i]), 0 };
                // Profiling must be enabled for cl_BenchmarkDevice's timing queries.
                cl_command_queue_properties qProperties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 };
                for (unsigned int ii = 0; ii < numDevices; ii++)
                {
                    cl_uint val;
                    cl_ulong memsz;
                    cl_device_type dt;
                    size_t mws;
                    std::clog << " >> DeviceInfo for device no." << (ii + 1) << std::endl;
                    if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_NAME, SZ_INFO, info, &sz)))
                        std::clog << "\t - Name: " << info << std::endl;
                    if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_VENDOR, SZ_INFO, info, &sz)))
                        std::clog << "\t - Vendor: " << info << std::endl;
                    if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_VERSION, SZ_INFO, info, &sz)))
                        std::clog << "\t - Version: " << info << std::endl;
                    if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_TYPE, sizeof(dt), &dt, &sz)))
                    {
                        std::clog << "\t - Type: ";
                        switch (dt)
                        {
                        case CL_DEVICE_TYPE_CPU: std::clog << "CPU"; break;
                        case CL_DEVICE_TYPE_GPU: std::clog << "GPU"; break;
                        case CL_DEVICE_TYPE_ACCELERATOR: std::clog << "Accelerator"; break;
                        case CL_DEVICE_TYPE_DEFAULT: std::clog << "Default"; break;
                        default: std::clog << "ERROR";
                        }
                        std::clog << std::endl;
                    }
                    if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(memsz), &memsz, &sz)))
                        std::clog << "\t - Memory: " << (memsz / 1024 / 1024) << " MB" << std::endl;
                    if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(val), &val, &sz)))
                        std::clog << "\t - Max Frequency: " << val << " MHz" << std::endl;
                    if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(val), &val, &sz)))
                        std::clog << "\t - Compute units: " << val << std::endl;
                    if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(mws), &mws, &sz)))
                        std::clog << "\t - Max workgroup size: " << mws << std::endl;
                    // Create a throwaway context for this single device
                    cl_context context = clCreateContext(NULL, 1, devIDsAll + ii, NULL, NULL, &err);
                    if (clCallSuccess(err) && context)
                    {
                        // Create a profiling-enabled command queue
                        cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, devIDsAll[ii], qProperties, &err);
                        if (clCallSuccess(err) && command_queue)
                        {
                            // Lower time wins; negative perf means the benchmark failed
                            float perf = cl_BenchmarkDevice(context, command_queue, devIDsAll[ii]);
                            if ((perf > 0) && (perf < best_perf))
                            {
                                best_perf = perf;
                                best_device = devIDsAll[ii];
                            }
                            clCallSuccess(clReleaseCommandQueue(command_queue));
                        }
                        clCallSuccess(clReleaseContext(context));
                    }
                }
            }
        }
    }
    return best_device;
}