根据以下测试用例,无论是否使用"simd",针对 AMD MI250 (gfx90a) 生成的汇编代码都相同。不过,如果查看 CPU 代码,就会看到明显的差异:在这种情况下,使用"simd"语句可以实现类似于显式使用"restrict"关键字的优化。
简而言之,目前"simd"语句(对 GPU 代码而言)不起任何作用,而且即使在非常简单的情况下也会导致以下警告:
"loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]"。
#include <cstddef>
#include <cstdint>
// Macro alias for the non-standard `__restrict` pointer qualifier
// (accepted by GCC, Clang and MSVC).
#define RESTRICT __restrict
// Floating-point element type used by the test kernels in this file.
using Float = double;
/// Scalar baseline without `__restrict`: stores b[0] squared into a[0], then
/// stores b[0] into a[1].
///
/// b[0] is accessed through a reference, so it is read again after a[0] has
/// been written; nothing is cached in a local copy.
///
/// @param a destination; elements 0 and 1 are written.
/// @param b source; only element 0 is read.
void test0_0(Float* a, const Float* b) {
    const auto& src = *b;  // reference, not a copy
    a[0] = src * src;
    a[1] = src;
}
/// Scalar variant whose pointers are qualified with `RESTRICT`: stores b[0]
/// squared into a[0], then stores b[0] into a[1].
///
/// b[0] is accessed through a reference, so every use is a read of the
/// pointed-to element rather than of a local copy.
///
/// @param a destination; elements 0 and 1 are written.
/// @param b source; only element 0 is read.
void test0_1(Float* RESTRICT a, const Float* RESTRICT b) {
    const auto& src = *b;  // reference, not a copy
    a[0] = src * src;
    a[1] = src;
}
/// Pair-wise kernel annotated with `#pragma omp parallel for` (no `simd`,
/// no `RESTRICT`).
///
/// For every even index e = 0, 2, 4, ... with e < length:
///   a[e] = b[e] * b[e] and a[e + 1] = b[e].
/// b[e] is read again for the second store; it is not cached in a local.
///
/// NOTE(review): for odd `length` the last iteration writes a[length];
/// presumably callers pass an even length or a large enough buffer — confirm.
///
/// @param a      destination buffer.
/// @param b      source buffer; only even-indexed elements are read.
/// @param length exclusive upper bound of the loop index.
void test1_0(Float* a, Float* b, std::size_t length) {
    #pragma omp parallel for
    for (std::size_t even = 0; even < length; even += 2) {
        const std::size_t odd = even + 1;
        a[even] = b[even] * b[even];
        a[odd] = b[even];
    }
}
/// Pair-wise kernel annotated with `#pragma omp parallel for simd`
/// (no `RESTRICT`).
///
/// For every even index e = 0, 2, 4, ... with e < length:
///   a[e] = b[e] * b[e] and a[e + 1] = b[e].
/// b[e] is read again for the second store; it is not cached in a local.
///
/// NOTE(review): for odd `length` the last iteration writes a[length];
/// presumably callers pass an even length or a large enough buffer — confirm.
///
/// @param a      destination buffer.
/// @param b      source buffer; only even-indexed elements are read.
/// @param length exclusive upper bound of the loop index.
void test1_1(Float* a, Float* b, std::size_t length) {
    #pragma omp parallel for simd
    for (std::size_t even = 0; even < length; even += 2) {
        const std::size_t odd = even + 1;
        a[even] = b[even] * b[even];
        a[odd] = b[even];
    }
}
/// Pair-wise kernel annotated with
/// `#pragma omp target teams distribute parallel for` (no `simd`, no
/// `RESTRICT`).
///
/// For every even index e = 0, 2, 4, ... with e < length:
///   a[e] = b[e] * b[e] and a[e + 1] = b[e].
/// b[e] is read again for the second store; it is not cached in a local.
///
/// NOTE(review): for odd `length` the last iteration writes a[length];
/// presumably callers pass an even length or a large enough buffer — confirm.
///
/// @param a      destination buffer.
/// @param b      source buffer; only even-indexed elements are read.
/// @param length exclusive upper bound of the loop index.
void test2_0(Float* a, Float* b, std::size_t length) {
    #pragma omp target teams distribute parallel for
    for (std::size_t even = 0; even < length; even += 2) {
        const std::size_t odd = even + 1;
        a[even] = b[even] * b[even];
        a[odd] = b[even];
    }
}
/// Pair-wise kernel annotated with
/// `#pragma omp target teams distribute parallel for` (no `simd`), with both
/// pointers qualified by `RESTRICT`.
///
/// For every even index e = 0, 2, 4, ... with e < length:
///   a[e] = b[e] * b[e] and a[e + 1] = b[e].
///
/// NOTE(review): for odd `length` the last iteration writes a[length];
/// presumably callers pass an even length or a large enough buffer — confirm.
///
/// @param a      destination buffer.
/// @param b      source buffer; only even-indexed elements are read.
/// @param length exclusive upper bound of the loop index.
void test2_1(Float* RESTRICT a, Float* RESTRICT b, std::size_t length) {
    #pragma omp target teams distribute parallel for
    for (std::size_t even = 0; even < length; even += 2) {
        const std::size_t odd = even + 1;
        a[even] = b[even] * b[even];
        a[odd] = b[even];
    }
}
/// Pair-wise kernel annotated with
/// `#pragma omp target teams distribute parallel for simd` (no `RESTRICT`).
///
/// For every even index e = 0, 2, 4, ... with e < length:
///   a[e] = b[e] * b[e] and a[e + 1] = b[e].
/// b[e] is read again for the second store; it is not cached in a local.
///
/// NOTE(review): for odd `length` the last iteration writes a[length];
/// presumably callers pass an even length or a large enough buffer — confirm.
///
/// @param a      destination buffer.
/// @param b      read-only source buffer; only even-indexed elements are read.
/// @param length exclusive upper bound of the loop index.
void test3_0(Float* a, const Float* b, std::size_t length) {
    #pragma omp target teams distribute parallel for simd
    for (std::size_t even = 0; even < length; even += 2) {
        const std::size_t odd = even + 1;
        a[even] = b[even] * b[even];
        a[odd] = b[even];
    }
}
/// Pair-wise kernel annotated with
/// `#pragma omp target teams distribute parallel for simd`, with both
/// pointers qualified by `RESTRICT`.
///
/// For every even index e = 0, 2, 4, ... with e < length:
///   a[e] = b[e] * b[e] and a[e + 1] = b[e].
///
/// NOTE(review): for odd `length` the last iteration writes a[length];
/// presumably callers pass an even length or a large enough buffer — confirm.
///
/// @param a      destination buffer.
/// @param b      read-only source buffer; only even-indexed elements are read.
/// @param length exclusive upper bound of the loop index.
void test3_1(Float* RESTRICT a, const Float* RESTRICT b, std::size_t length) {
    #pragma omp target teams distribute parallel for simd
    for (std::size_t even = 0; even < length; even += 2) {
        const std::size_t odd = even + 1;
        a[even] = b[even] * b[even];
        a[odd] = b[even];
    }
}
test2_1(Float* RESTRICT a, Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0];
a[i + 1] = b[i + 0];
}
}
/// Copy variant annotated with
/// `#pragma omp target teams distribute parallel for simd` (no `RESTRICT`):
/// for every even index e = 0, 2, 4, ... with e < length, stores b[e] into
/// both a[e] and a[e + 1]. b[e] is read once per store, not cached.
///
/// NOTE(review): a function with this name and parameter list is also
/// defined earlier in this file — presumably a separate snippet; confirm.
/// NOTE(review): for odd `length` the last iteration writes a[length].
///
/// @param a      destination buffer.
/// @param b      read-only source buffer; only even-indexed elements are read.
/// @param length exclusive upper bound of the loop index.
void test3_0(Float* a, const Float* b, std::size_t length) {
    #pragma omp target teams distribute parallel for simd
    for (std::size_t even = 0; even < length; even += 2) {
        const std::size_t odd = even + 1;
        a[even] = b[even];
        a[odd] = b[even];
    }
}
/// Copy variant annotated with
/// `#pragma omp target teams distribute parallel for simd`, with both
/// pointers qualified by `RESTRICT`: for every even index e = 0, 2, 4, ...
/// with e < length, stores b[e] into both a[e] and a[e + 1].
///
/// NOTE(review): a function with this name and parameter list is also
/// defined earlier in this file — presumably a separate snippet; confirm.
/// NOTE(review): for odd `length` the last iteration writes a[length].
///
/// @param a      destination buffer.
/// @param b      read-only source buffer; only even-indexed elements are read.
/// @param length exclusive upper bound of the loop index.
void test3_1(Float* RESTRICT a, const Float* RESTRICT b, std::size_t length) {
    #pragma omp target teams distribute parallel for simd
    for (std::size_t even = 0; even < length; even += 2) {
        const std::size_t odd = even + 1;
        a[even] = b[even];
        a[odd] = b[even];
    }
}
代码可在以下链接找到:https://godbolt.org/z/sMY48s8jz
你是否尝试过分别在启用和不启用 simd 开关的情况下运行程序?编译结果或性能是否有任何差异? - Fra93