我目前正在处理一个复制函数,它从源字节数组填充目标字节数组,并复制源数组直到目标数组被填满(有些人称其为MemCpyReplicate或类似名称)。目标数组的长度始终是源数组长度的倍数。
我的第一次尝试是通过 Unsafe.CopyBlockUnaligned 内在函数(intrinsic)进行简单的复制,它只会发出 rep movsb 指令:
/// <summary>
/// Writes <paramref name="count"/> consecutive copies of the <paramref name="byteCount"/>-byte
/// pattern at <paramref name="source"/> to <paramref name="destination"/>.
/// </summary>
/// <param name="destination">Start of the output buffer; must have room for
/// <c>byteCount * count</c> bytes (not checked here).</param>
/// <param name="source">Start of the pattern to repeat.</param>
/// <param name="byteCount">Length of the pattern in bytes. Non-positive values make the call a no-op.</param>
/// <param name="count">Number of repetitions. Non-positive values make the call a no-op.</param>
public static unsafe void CopyRepeat(byte* destination, byte* source, int byteCount, int count) {
    // A negative length would wrap to a huge uint in the cast below and overrun both
    // buffers; a zero length has nothing to copy.
    if(byteCount <= 0) {
        return;
    }
    while(count-- > 0) {
        Unsafe.CopyBlockUnaligned(destination, source, (uint)byteCount);
        destination += byteCount;
    }
}
由于结果不令人满意,我现在想使用 SIMD,更确切地说是 Vector<T> 接口。但我不知道如何处理未对齐的地址,以及长度小于向量长度的字节模式。
这将是我的理想解决方案:
源数组 -> 10字节,
向量 -> 32字节 = 3个字节模式
这些字节序列大多在1到64字节的范围内。重复次数从1到500不等。有更好的解决方案吗?或者有类似功能的示例实现吗?
更新:
我从原始版本构建了两种矢量化变体。第一种变体在向量中重复模式,使向量包含 n 个模式;如果模式太大、无法放入向量,则使用 CopyBlock。第二种变体先重复模式,直到目标中已写入的字节数超过向量大小,然后始终以向量大小的块进行复制(并移动源窗口),而不使用 CopyBlock。
| 方法 | 字节数 | 次数 | 平均值 | 误差 | 标准偏差 |
|---|---|---|---|---|---|
| Repeat_CopyBlock | 3 | 16 | 19.38 ns | 0.002 ns | 0.002 ns |
| Repeat_NoCopyBlock | 3 | 16 | 13.90 ns | 0.106 ns | 0.100 ns |
| Repeat_CopyBlock | 3 | 128 | 25.00 ns | 0.005 ns | 0.005 ns |
| Repeat_NoCopyBlock | 3 | 128 | 39.31 ns | 0.135 ns | 0.126 ns |
| Repeat_CopyBlock | 12 | 16 | 10.64 ns | 0.037 ns | 0.031 ns |
| Repeat_NoCopyBlock | 12 | 16 | 13.35 ns | 0.024 ns | 0.023 ns |
| Repeat_CopyBlock | 12 | 128 | 25.56 ns | 0.020 ns | 0.019 ns |
| Repeat_NoCopyBlock | 12 | 128 | 108.61 ns | 0.164 ns | 0.154 ns |
| Repeat_CopyBlock | 16 | 16 | 68.74 ns | 0.010 ns | 0.009 ns |
| Repeat_NoCopyBlock | 16 | 16 | 13.50 ns | 0.002 ns | 0.002 ns |
| Repeat_CopyBlock | 16 | 128 | 81.41 ns | 0.024 ns | 0.022 ns |
| Repeat_NoCopyBlock | 16 | 128 | | | |
/// <summary>
/// Fills <paramref name="destination"/> with <paramref name="count"/> consecutive copies of the
/// <paramref name="byteCount"/>-byte pattern at <paramref name="source"/>, using
/// <see cref="Vector{T}"/>-sized block copies once enough of the pattern has been laid down.
/// </summary>
/// <param name="destination">Start of the output buffer; must have room for
/// <c>byteCount * count</c> bytes (not checked here).</param>
/// <param name="source">Start of the pattern to repeat.</param>
/// <param name="byteCount">Length of the pattern in bytes. Non-positive values make the call a no-op.</param>
/// <param name="count">Number of repetitions. Non-positive values make the call a no-op.</param>
/// <remarks>
/// <c>byteCount * count</c> is computed in 32-bit arithmetic and is assumed not to overflow.
/// </remarks>
public static unsafe void Repeat_NoCopyBlock(byte* destination, byte* source, int byteCount, int count) {
    // Negative arguments would otherwise turn into a huge InitBlock length, a runaway
    // scalar loop, or a tail store in front of the destination buffer.
    if(byteCount <= 0 || count <= 0) {
        return;
    }
    if(byteCount == 1) {
        // A one-byte pattern is a plain memset.
        Unsafe.InitBlockUnaligned(destination, *source, (uint)count);
        return;
    }
    var vectorSize = Vector<byte>.Count;
    var absoluteByteCount = byteCount * count;
    var dst = destination;
    var offset = 0;
    // Phase 1: lay down whole patterns with scalar stores until at least two vectors'
    // worth of bytes exist (or the destination is complete). Afterwards `offset` is a
    // multiple of byteCount, so the data already written can serve as the copy source.
    do
    {
        if(offset == absoluteByteCount) {
            return;
        }
        offset += byteCount;
        var src = source;
        var remaining = byteCount;
        while(remaining >= 4) {
            *((uint*)dst) = *((uint*)src);
            dst += 4;
            src += 4;
            remaining -= 4;
        }
        if((remaining & 2) != 0) {
            *((ushort*)dst) = *((ushort*)src);
            dst += 2;
            src += 2;
            remaining -= 2;
        }
        if((remaining & 1) != 0) {
            *dst++ = *src;
        }
    } while(offset < 2 * vectorSize);
    // Phase 2: copy vector-sized blocks from the start of the destination into its
    // unwritten part. The read window trails the write position by the phase-1 length
    // (a multiple of byteCount and at least two vectors), so it only ever reads bytes
    // that are already written and stays in phase with the pattern.
    var stopLoopAtOffset = absoluteByteCount - vectorSize;
    var from = destination;
    while(offset <= stopLoopAtOffset) {
        Unsafe.WriteUnaligned(dst, Unsafe.ReadUnaligned<Vector<byte>>(from));
        offset += vectorSize;
        from += vectorSize;
        dst += vectorSize;
    }
    // Phase 3: fewer than one vector of bytes is left. Store one final vector that ends
    // exactly at the end of the destination, read from a window that ends on a pattern
    // boundary inside the written region so both windows are in the same phase.
    if(offset != absoluteByteCount) {
        var rep = (offset / byteCount) * byteCount;
        var repEnd = destination + rep - vectorSize;
        var dstEnd = destination + stopLoopAtOffset;
        Unsafe.WriteUnaligned(dstEnd, Unsafe.ReadUnaligned<Vector<byte>>(repEnd));
    }
}
/// <summary>
/// Fills <paramref name="destination"/> with <paramref name="count"/> consecutive copies of the
/// <paramref name="byteCount"/>-byte pattern at <paramref name="source"/>. Patterns that fit
/// into a <see cref="Vector{T}"/> are packed into one vector and stored several at a time;
/// whatever is left is copied pattern by pattern with <c>Unsafe.CopyBlockUnaligned</c>.
/// </summary>
/// <param name="destination">Start of the output buffer; must have room for
/// <c>byteCount * count</c> bytes (not checked here).</param>
/// <param name="source">Start of the pattern to repeat.</param>
/// <param name="byteCount">Length of the pattern in bytes. Non-positive values make the call a no-op.</param>
/// <param name="count">Number of repetitions. Non-positive values make the call a no-op.</param>
public static unsafe void Repeat_CopyBlock(byte* destination, byte* source, int byteCount, int count) {
    if(count <= 0 || byteCount <= 0) {
        return;
    }
    if(byteCount == 1) {
        // A one-byte pattern is a plain memset.
        Unsafe.InitBlockUnaligned(destination, *source, (uint)count);
        return;
    }
    // How many whole patterns fit into one vector (0 when the pattern is larger).
    var numElements = Vector<byte>.Count / byteCount;
    var numElementsByteCount = numElements * byteCount;
    var i = 0;
    var dst = destination;
    // Phase 1: write the pattern with scalar stores, at least once and at most
    // numElements times, so the start of the destination holds one vector's worth of
    // whole patterns.
    do
    {
        var remaining = byteCount;
        var src = source;
        while(remaining >= 4) {
            *((uint*)dst) = *((uint*)src);
            dst += 4;
            src += 4;
            remaining -= 4;
        }
        if((remaining & 2) != 0) {
            *((ushort*)dst) = *((ushort*)src);
            dst += 2;
            src += 2;
            remaining -= 2;
        }
        if((remaining & 1) != 0) {
            *dst++ = *src;
        }
        ++i;
        --count;
    } while(count != 0 && i < numElements);
    // Phase 2: store the packed vector repeatedly. The vector is loaded only when more
    // than numElements patterns remain: that guarantees the destination is longer than
    // one vector, so neither the load nor any store touches memory outside it. Each
    // store spills a partial pattern past numElementsByteCount, which the next store
    // (or phase 3) overwrites; the strict '>' keeps at least one pattern for that.
    if(numElements > 0 && count > numElements) {
        var packed = Unsafe.ReadUnaligned<Vector<byte>>(destination);
        while(count > numElements) {
            Unsafe.WriteUnaligned(dst, packed);
            count -= numElements;
            dst += numElementsByteCount;
        }
    }
    // Phase 3: finish pattern by pattern, reading the first copy already in the destination.
    while(count > 0) {
        Unsafe.CopyBlockUnaligned(dst, destination, (uint)byteCount);
        dst += byteCount;
        --count;
    }
}
|