F# NativePtr.stackalloc比C# stackalloc慢 - 包含反编译代码

6
继续进行F#性能测试。有关更多背景信息,请参见此处:
f# NativePtr.stackalloc in Struct Constructor
F# NativePtr.stackalloc Unexpected Stack Overflow
现在我已经让堆栈数组在F#中工作了。然而,出于某种原因,等效的C#速度大约快50倍。我已经包含了以下ILSpy反编译版本,它似乎只有1行真正不同(在stackAlloc内部)。这里发生了什么?难道未经检查的算术确实导致了如此大的差异吗?不确定我该如何测试这个?
https://msdn.microsoft.com/en-us/library/a569z7k8.aspx
F#代码:
#nowarn "9"

open Microsoft.FSharp.NativeInterop
open System
open System.Diagnostics    
open System.Runtime.CompilerServices        

[<MethodImpl(MethodImplOptions.NoInlining)>]
let stackAlloc x =
    let mutable ints:nativeptr<byte> = NativePtr.stackalloc x
    ()   

[<EntryPoint>]
let main argv = 
    printfn "%A" argv

    let size = 8192            
    let reps = 10000

    stackAlloc size // JIT
    let clock = Stopwatch()
    clock.Start()
    for i = 1 to reps do            
        stackAlloc size
    clock.Stop()

    let elapsed = clock.Elapsed.TotalMilliseconds
    let description = "F# NativePtr.stackalloc"
    Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed)

    Console.ReadKey() |> ignore
    0

C# 代码

using System;
using System.Diagnostics;

namespace CSharpLanguageFeatures
{
    class CSharpStackArray
    {
        static void Main(string[] args)
        {
            int size = 8192;
            int reps = 10000;

            stackAlloc(size); // JIT
            Stopwatch clock = new Stopwatch();
            clock.Start();
            for (int i = 0; i < reps; i++)
            {
                stackAlloc(size);
            }
            clock.Stop();

            string elapsed = clock.Elapsed.TotalMilliseconds.ToString("#,##0.####");
            string description = "C# stackalloc";
            Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed);
            Console.ReadKey();
        }

        public unsafe static void stackAlloc(int arraySize)
        {
            byte* pArr = stackalloc byte[arraySize];
        }
    }
}
using Microsoft.FSharp.Core;
using System;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;

[CompilationMapping(SourceConstructFlags.Module)]
public static class FSharpStackArray
{
    [MethodImpl(MethodImplOptions.NoInlining)]
    public unsafe static void stackAlloc(int x)
    {
        IntPtr ints = stackalloc byte[x * sizeof(byte)];
    }

    [EntryPoint]
    public static int main(string[] argv)
    {
        PrintfFormat<FSharpFunc<string[], Unit>, TextWriter, Unit, Unit> format = new PrintfFormat<FSharpFunc<string[], Unit>, TextWriter, Unit, Unit, string[]>("%A");
        PrintfModule.PrintFormatLineToTextWriter<FSharpFunc<string[], Unit>>(Console.Out, format).Invoke(argv);
        FSharpStackArray.stackAlloc(8192);
        Stopwatch clock = new Stopwatch();
        clock.Start();
        for (int i = 1; i < 10001; i++)
        {
            FSharpStackArray.stackAlloc(8192);
        }
        clock.Stop();
        double elapsed = clock.Elapsed.TotalMilliseconds;
        Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", "F# NativePtr.stackalloc", 8192, 10000, elapsed);
        ConsoleKeyInfo consoleKeyInfo = Console.ReadKey();
        return 0;
    }
}

C# 版本反编译

using System;
using System.Diagnostics;

namespace CSharpLanguageFeatures
{
    internal class CSharpStackArray
    {
        private static void Main(string[] args)
        {
            int size = 8192;
            int reps = 10000;
            CSharpStackArray.stackAlloc(size);
            Stopwatch clock = new Stopwatch();
            clock.Start();
            for (int i = 0; i < reps; i++)
            {
                CSharpStackArray.stackAlloc(size);
            }
            clock.Stop();
            string elapsed = clock.Elapsed.TotalMilliseconds.ToString("#,##0.####");
            string description = "C# stackalloc";
            Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", new object[]
            {
                description,
                size,
                reps,
                elapsed
            });
            Console.ReadKey();
        }

        public unsafe static void stackAlloc(int arraySize)
        {
            IntPtr arg_06_0 = stackalloc byte[checked(unchecked((UIntPtr)arraySize) * 1)];
        }
    }
}

F#版本的IL字节分配

.method public static 
    void stackAlloc (
        int32 x
    ) cil managed noinlining 
{
    // Method begins at RVA 0x2050
    // Code size 13 (0xd)
    .maxstack 4
    .locals init (
        [0] native int ints
    )

    IL_0000: nop
    IL_0001: ldarg.0
    IL_0002: sizeof [mscorlib]System.Byte
    IL_0008: mul
    IL_0009: localloc
    IL_000b: stloc.0
    IL_000c: ret
} // end of method FSharpStackArray::stackAlloc

C# 版本 IL - 字节分配

.method public hidebysig static 
    void stackAlloc (
        int32 arraySize
    ) cil managed 
{
    // Method begins at RVA 0x2094
    // Code size 8 (0x8)
    .maxstack 8

    IL_0000: ldarg.0
    IL_0001: conv.u
    IL_0002: ldc.i4.1
    IL_0003: mul.ovf.un
    IL_0004: localloc
    IL_0006: pop
    IL_0007: ret
} // end of method CSharpStackArray::stackAlloc   

更新的F# IL - IntPtr分配

(该内容是关于IT技术方面的)
.method public static 
    void stackAlloc (
        int32 x
    ) cil managed noinlining 
{
    // Method begins at RVA 0x2050
    // Code size 13 (0xd)
    .maxstack 4
    .locals init (
        [0] native int ints
    )

    IL_0000: nop
    IL_0001: ldarg.0
    IL_0002: sizeof [mscorlib]System.IntPtr
    IL_0008: mul
    IL_0009: localloc
    IL_000b: stloc.0
    IL_000c: ret
} // end of method FSharpStackArray::stackAlloc

更新的C# IL - IntPtr分配

.method public hidebysig static 
    void stackAlloc (
        int32 arraySize
    ) cil managed 
{
    // Method begins at RVA 0x2415
    // Code size 13 (0xd)
    .maxstack 8

    IL_0000: ldarg.0
    IL_0001: conv.u
    IL_0002: sizeof [mscorlib]System.IntPtr
    IL_0008: mul.ovf.un
    IL_0009: localloc
    IL_000b: pop
    IL_000c: ret
} // end of method CSharpStackArray::stackAlloc

2
你能展示一下 stackAlloc 方法的实际 IL 吗? - Fyodor Soikin
可能是 [mscorlib]System.Byte 的 sizeof 或者 nop?循环中也有一个 nop? - Researcher
尝试使用在不同架构上会有所不同的类型,比如 intptr 本身。似乎 C# 编译器通过使用 sizeof(byte) 总是等于 1 的知识来优化这个计算。 - Fyodor Soikin
更改为IntPtr分配使C#版本的执行时间比原来长约2倍。它使F#版本的执行时间比原来长10倍。 - Researcher
我注意到F#被标记为noinline。将其添加到C#版本中,但没有太大的区别。在这种情况下,确实需要看到执行的确切x64代码。不幸的是,JIT:er会根据是否连接了调试器来更改JIT:ed代码,这使得这一点更加困难。可以在循环内部生成崩溃并获取内存转储,然后使用windebug检查该代码。 - Just another metaprogrammer
显示剩余8条评论
1个回答

6
感谢大家对此的帮助。
答案是C#编译器未将指针存储为本地变量。这是因为分配的内存从未被使用。缺乏"sizeof"和不同的"mul"使得C#另外稍微占了一些优势。
F#汇编程序——区别已经注释说明。
.method public static 
    void stackAlloc (
        int32 x
    ) cil managed noinlining 
{
    // Method begins at RVA 0x2050
    // Code size 13 (0xd)
    .maxstack 4
    .locals init ( //***** Not in C# Version *****//
        [0] native int ints
    )

    IL_0000: nop
    IL_0001: ldarg.0
    IL_0002: sizeof [mscorlib]System.Byte //***** C# just uses "1" *****//
    IL_0008: mul //***** C# uses "mul.ovf.un" *****//
    IL_0009: localloc
    IL_000b: stloc.0 //***** Not in C# Version *****//
    IL_000c: ret
} // end of method FSharpStackArray::stackAlloc

C#汇编器 - 差异已注释

.method public hidebysig static 
    void stackAlloc (
        int32 arraySize
    ) cil managed 
{
    // Method begins at RVA 0x2094
    // Code size 8 (0x8)
    .maxstack 8

    IL_0000: ldarg.0
    IL_0001: conv.u
    IL_0002: ldc.i4.1 //***** F# uses sizeof [mscorlib]System.Byte *****//
    IL_0003: mul.ovf.un //***** F# uses "mul" *****//
    IL_0004: localloc
    IL_0006: pop
    IL_0007: ret
} // end of method CSharpStackArray::stackAlloc  

这个练习教会了我一些东西:
  1. 编译器执行很多优化。显然,不同语言中的相同高级代码可能导致完全不同的机器指令。
  2. 在对dotnet语言进行基准测试时,您可以阅读中间程序集以真正了解发生了什么。使用ILSpy。
  3. 您可以使用ilasm.exe修改和编译中间程序集。
  4. C#编译器在消除不必要的代码方面做得更好。一旦您设置了分配内存中的每个字节,性能就会变得非常类似于最初预期的。

最终F#代码

#nowarn "9"

open Microsoft.FSharp.NativeInterop
open System
open System.Diagnostics    
open System.Runtime.CompilerServices        

[<MethodImpl(MethodImplOptions.NoInlining)>]
let stackAlloc x =
    let mutable bytes:nativeptr<byte> = NativePtr.stackalloc x
    for i = 0 to (x - 1) do
        NativePtr.set bytes i (byte i)
    ()   

[<EntryPoint>]
let main argv = 
    printfn "%A" argv

    let size = 8192            
    let reps = 10000

    stackAlloc size // JIT
    let clock = Stopwatch()
    clock.Start()
    for i = 1 to reps do            
        stackAlloc size
    clock.Stop()

    let elapsed = clock.Elapsed.TotalMilliseconds
    let description = "F# NativePtr.stackalloc"
    Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed)

    Console.ReadKey() |> ignore
    0

C#最终代码

using System;
using System.Diagnostics;

namespace CSharpStackArray
{
    class Program
    {
        static void Main(string[] args)
        {
            int size = 8192;
            int reps = 10000;

            stackAlloc(size); // JIT
            Stopwatch clock = new Stopwatch();
            clock.Start();
            for (int i = 0; i < reps; i++)
            {
                stackAlloc(size);
            }
            clock.Stop();

            string elapsed = clock.Elapsed.TotalMilliseconds.ToString("#,##0.####");
            string description = "C# stackalloc";
            Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed);
            Console.ReadKey();
        }

        public unsafe static void stackAlloc(int arraySize)
        {
            byte* pArr = stackalloc byte[arraySize];
            for (int i = 0; i < arraySize; i++)
            {
                pArr[i] = (byte)i;
            }
        }
    }
}

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接