看起来你正在进行的操作实际上是“收集”数据。现代CPU有专门的指令,特别是VPGATHER**
。在.NET Core 3中公开了这一指令,并且应该可以像下面这样工作,这是单循环情况(你可能可以从这里开始获取双循环版本);
首先是结果:
AVX enabled: False
e7ad04457529f201558c8a53f639fed30d3a880f75e613afe203e80a7317d0cb
for 524288 loops: 1524ms
AVX enabled: True
e7ad04457529f201558c8a53f639fed30d3a880f75e613afe203e80a7317d0cb
for 524288 loops: 667ms
代码:
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
static class P
{
static int Gather(int[] source, int[] index, int[] results, bool avx)
{
int y = 0;
if (Avx2.IsSupported && avx)
{
var iv = MemoryMarshal.Cast<int, Vector256<int>>(index);
var rv = MemoryMarshal.Cast<int, Vector256<int>>(results);
unsafe
{
fixed (int* sPtr = source)
{
for (int i = 0; i < rv.Length; i++)
{
rv[i] = Avx2.GatherVector256(sPtr, iv[i], 4);
}
}
}
y += rv.Length * Vector256<int>.Count;
}
int result = y;
int end = results.Length;
for (; y < end; y++)
{
results[y] = source[index[y]];
}
return result;
}
static void Main()
{
var rand = new Random(12345);
int size = 1024 * 512;
int[] data = new int[size];
for (int i = 0; i < data.Length; i++)
data[i] = rand.Next(255);
int[] index = new int[1024];
for (int i = 0; i < index.Length; i++)
index[i] = rand.Next(size);
int[] results = new int[1024];
void GatherLocal(bool avx)
{
Array.Clear(results, 0, results.Length);
int from = Gather(data, index, results, avx);
Console.WriteLine($"AVX enabled: {avx}; slow loop from {from}");
for (int i = 0; i < 32; i++)
{
Console.Write(results[i].ToString("x2"));
}
Console.WriteLine();
const int TimeLoop = 1024 * 512;
var watch = Stopwatch.StartNew();
for (int i = 0; i < TimeLoop; i++)
Gather(data, index, results, avx);
watch.Stop();
Console.WriteLine($"for {TimeLoop} loops: {watch.ElapsedMilliseconds}ms");
Console.WriteLine();
}
GatherLocal(false);
if (Avx2.IsSupported) GatherLocal(true);
}
}
for
循环遍历height*width
会略微更快,但除此之外很难发现任何明显的优化机会。lookUp
中的索引分布是否有任何模式? - 500 - Internal Server Error