不幸的是,在使用SYCL为数值核心实现GPU加速的过程中,我发现只是从`sycl::malloc_device()`切换到`sycl::malloc_shared()`就导致性能下降了最多4000% - 即使我只是重复提交相同的SYCL核心,而没有尝试从主机访问数据。
使用OpenSYCL针对AMD HIP GFX906(Radeon VII / Instinct MI50)构建代码,并使用`sycl::malloc_device()`,程序在0.27秒内完成。
$ time ./fdtd.elf
simulate 16974593 cells for 10 timesteps.
real 0m0.271s
user 0m0.253s
sys 0m0.020s
使用sycl::malloc_shared()
构建相同的代码时,程序完成需要10.6秒:
simulate 16974593 cells for 10 timesteps.
real 0m10.649s
user 0m15.172s
sys 0m0.196s
这是一个3925%的减速。
在BIOS中启用了"超过4G解码"和"可调整大小的BAR"支持后,现在只需要3.8秒而不是10.6秒。但这并未解决无谓的内存传输的实际问题 - 1300%的性能损失仍然相当显著。
我之前还使用英特尔DPC++编译器测试了类似的内核,在相同的硬件上看到了类似的结果。
我怀疑减速是由于无谓的主机和设备拷贝引起的,但我不确定。SYCL运行时使用什么启发式算法来确定是否需要拷贝呢?
下面是示例代码。
ArrayNXYZ.hpp:4维数组(n,x,y,z)的包装类。
#include <sycl/sycl.hpp>
template <typename T>
struct ArrayXYZN
{
ArrayXYZN() {}
inline T& operator() (const unsigned int n, const unsigned int x, const unsigned int y, const unsigned int z) const
{
size_t offset = n * n_stride + x * x_stride + y * y_stride + z;
return array[offset];
}
unsigned long n_stride, x_stride, y_stride, size;
T *array;
};
template <typename T>
ArrayXYZN<T>* CreateArrayXYZN(sycl::queue Q, const unsigned int* numLines)
{
unsigned int n_max = 3;
unsigned int x_max = numLines[0];
unsigned int y_max = numLines[1];
unsigned int z_max = numLines[2];
unsigned long n_stride = x_max * y_max * z_max;
unsigned long x_stride = y_max * z_max;
unsigned long y_stride = z_max;
if (n_stride % 128 != 0)
{
n_stride += 128 - (n_stride % 128);
}
// allocate 1D linear buffer
size_t size = n_stride * n_max;
#ifdef USM
T *buf = sycl::malloc_shared<T>(size, Q);
#else
T *buf = sycl::malloc_device<T>(size, Q);
#endif
// zero memory
Q.submit([&](sycl::handler& h) {
h.memset(buf, 0, size * sizeof(T));
});
Q.wait();
// allocate wrapper class
ArrayXYZN<T>* array = new ArrayXYZN<T>();
array->n_stride = n_stride;
array->x_stride = x_stride;
array->y_stride = y_stride;
array->size = size * sizeof(T);
array->array = buf;
return array;
}
fdtd.cpp:
#include <sycl/sycl.hpp>
#include "ArrayNXYZ.hpp"
/*
* UpdateVoltages
*
* Using Finite Difference Time Domain (FDTD) method,
* calculate new electric field array "volt" based on
* magnetic field "curr" and two electromagnetic field
* operators "vv" and "vi", precalculated from the
* physical materials before starting up simulation.
*/
void UpdateVoltages(
const ArrayXYZN<float>& volt,
const ArrayXYZN<float>& curr,
const ArrayXYZN<float>& vv,
const ArrayXYZN<float>& vi,
int x, int y, int z
)
{
// note: each (x, y, z) cell has three polarizations
// x, y, z, these are different from the cell's
// coordinates (x, y, z)
//for x polarization
float volt0 = volt(0, x, y, z);
volt0 *= vv(0, x, y, z);
volt0 +=
vi(0, x, y, z) * (
curr(2, x, y , z ) -
curr(2, x, y-1, z ) -
curr(1, x, y , z ) +
curr(1, x, y , z-1)
);
//for y polarization
float volt1 = volt(1, x, y, z);
volt1 *= vv(1, x, y, z);
volt1 +=
vi(1, x, y, z) * (
curr(0, x , y, z ) -
curr(0, x , y, z-1) -
curr(2, x , y, z ) +
curr(2, x-1, y, z )
);
//for z polarization
float volt2 = volt(2, x, y, z);
volt2 *= vv(2, x, y, z);
volt2 +=
vi(2, x, y, z) * (
curr(1, x , y , z) -
curr(1, x-1, y , z) -
curr(0, x , y , z) +
curr(0, x , y-1, z)
);
volt(0, x, y, z) = volt0;
volt(1, x, y, z) = volt1;
volt(2, x, y, z) = volt2;
}
int main(void)
{
const unsigned int numLines[3] = {257, 257, 257};
const int timesteps = 10;
sycl::queue Q;
ArrayXYZN<float>& volt = *CreateArrayXYZN<float>(Q, numLines);
ArrayXYZN<float>& curr = *CreateArrayXYZN<float>(Q, numLines);
ArrayXYZN<float>& vv = *CreateArrayXYZN<float>(Q, numLines);
ArrayXYZN<float>& vi = *CreateArrayXYZN<float>(Q, numLines);
size_t size = numLines[0] * numLines[1] * numLines[2];
fprintf(stderr, "simulate %ld cells for %d timesteps.\n", size, timesteps);
for (int i = 0; i < timesteps; i++) {
Q.submit([&](sycl::handler &h) {
h.parallel_for<class Voltage>(
sycl::range(numLines[0] - 1, numLines[1] - 1, numLines[2] - 1),
[=](sycl::item<3> itm) {
/*
* The first cell on each dimension has data dependency
* outside the simulation box (boundary condition).
* Ignore them for now.
*/
int x = itm.get_id(0) + 1;
int y = itm.get_id(1) + 1;
int z = itm.get_id(2) + 1;
UpdateVoltages(volt, curr, vv, vi, x, y, z);
}
);
});
Q.wait();
}
}