在.NET 4.7中比较字典查找和多个is运算符时，出现意外的性能结果。

Question

在.NET 4.7中比较字典查找和多个is运算符时，出现意外的性能结果。

9

我有一个问题，需要根据对象类型进行动态调度。我需要根据编译时已知的类型进行调度，在我的示例中它们是17。

我的初步想法是使用Dictionary<Type, Action<Object>>进行调度，并使用obj.GetType()查找适当的操作。但后来我决定使用BenchmarkDotNet来看看是否可以做得更好，以及调度查找的开销有多大。下面是我用于基准测试的代码。

public class Program
{
    private static readonly Object Value = Guid.NewGuid();
    private static readonly Dictionary<Type, Action<Object>> Dictionary = new Dictionary<Type, Action<Object>>()
    {
        [ typeof( Byte ) ] = x => Empty( (Byte)x ),
        [ typeof( Byte[] ) ] = x => Empty( (Byte[])x ),
        [ typeof( SByte ) ] = x => Empty( (SByte)x ),
        [ typeof( Int16 ) ] = x => Empty( (Int16)x ),
        [ typeof( UInt16 ) ] = x => Empty( (UInt16)x ),
        [ typeof( Int32 ) ] = x => Empty( (Int32)x ),
        [ typeof( UInt32 ) ] = x => Empty( (UInt32)x ),
        [ typeof( Int64 ) ] = x => Empty( (Int64)x ),
        [ typeof( UInt64 ) ] = x => Empty( (UInt64)x ),
        [ typeof( Decimal ) ] = x => Empty( (Decimal)x ),
        [ typeof( Single ) ] = x => Empty( (Single)x ),
        [ typeof( Double ) ] = x => Empty( (Double)x ),
        [ typeof( String ) ] = x => Empty( (String)x ),
        [ typeof( DateTime ) ] = x => Empty( (DateTime)x ),
        [ typeof( TimeSpan ) ] = x => Empty( (TimeSpan)x ),
        [ typeof( Guid ) ] = x => Empty( (Guid)x ),
        [ typeof( Char ) ] = x => Empty( (Char)x ),
    };


    [Benchmark]
    public void Switch() => Switch( Value );


    [Benchmark]
    public void Lookup() => Lookup( Value );


    private static void Switch( Object value )
    {
        if ( value is Byte ) goto L_Byte;
        if ( value is SByte ) goto L_SByte;
        if ( value is Int16 ) goto L_Int16;
        if ( value is UInt16 ) goto L_UInt16;
        if ( value is Int32 ) goto L_Int32;
        if ( value is UInt32 ) goto L_UInt32;
        if ( value is Int64 ) goto L_Int64;
        if ( value is UInt64 ) goto L_UInt64;
        if ( value is Decimal ) goto L_Decimal;
        if ( value is Single ) goto L_Single;
        if ( value is Double ) goto L_Double;
        if ( value is DateTime ) goto L_DateTime;
        if ( value is TimeSpan ) goto L_TimeSpan;
        if ( value is DateTimeOffset ) goto L_DateTimeOffset;
        if ( value is String ) goto L_String;
        if ( value is Byte[] ) goto L_ByteArray;
        if ( value is Char ) goto L_Char;
        if ( value is Guid ) goto L_Guid;

        return;

        L_Byte: Empty( (Byte)value ); return;
        L_SByte: Empty( (SByte)value ); return;
        L_Int16: Empty( (Int16)value ); return;
        L_UInt16: Empty( (UInt16)value ); return;
        L_Int32: Empty( (Int32)value ); return;
        L_UInt32: Empty( (UInt32)value ); return;
        L_Int64: Empty( (Int64)value ); return;
        L_UInt64: Empty( (UInt64)value ); return;
        L_Decimal: Empty( (Decimal)value ); return;
        L_Single: Empty( (Single)value ); return;
        L_Double: Empty( (Double)value ); return;
        L_DateTime: Empty( (DateTime)value ); return;
        L_DateTimeOffset: Empty( (DateTimeOffset)value ); return;
        L_TimeSpan: Empty( (TimeSpan)value ); return;
        L_String: Empty( (String)value ); return;
        L_ByteArray: Empty( (Byte[])value ); return;
        L_Char: Empty( (Char)value ); return;
        L_Guid: Empty( (Guid)value ); return;
    }


    private static void Lookup( Object value )
    {
        if ( Dictionary.TryGetValue( value.GetType(), out var action ) )
        {
            action( value );
        }
    }


    [MethodImpl( MethodImplOptions.NoInlining )]
    private static void Empty<T>( T value ) { }


    static void Main( string[] args )
    {
        BenchmarkRunner.Run( typeof( Program ) );

        Console.ReadLine();
    }
}

在我的示例中，我使用了一个封装的Guid来运行测试，这是手工制作开关函数中最糟糕的情况。结果令人惊讶，可以这么说：

BenchmarkDotNet=v0.10.11, OS=Windows 10 Redstone 3 [1709, Fall Creators Update] (10.0.16299.125)
    Processor=Intel Core i7-4790K CPU 4.00GHz (Haswell), ProcessorCount=8
    Frequency=3903988 Hz, Resolution=256.1483 ns, Timer=TSC
      [Host]     : .NET Framework 4.7 (CLR 4.0.30319.42000), 32bit LegacyJIT-v4.7.2600.0
      DefaultJob : .NET Framework 4.7 (CLR 4.0.30319.42000), 32bit LegacyJIT-v4.7.2600.0

     Method |     Mean |     Error |    StdDev |
    ------- |---------:|----------:|----------:|
     Switch | 13.21 ns | 0.1057 ns | 0.0989 ns |
     Lookup | 28.22 ns | 0.1082 ns | 0.1012 ns |

这个开关函数在最坏情况下运行速度快了两倍。如果我重新排列if语句，使最常见的类型排在前面，那么平均来说，我希望它能跑得更快3-5倍。

我的问题是为什么18次检查比单个字典查找要快这么多？我有没有漏掉一些明显的东西？

编辑：

原来的测试是在x64机器上的x86（优先32位）模式下进行的。我也在64版中进行了测试：

    Method |      Mean |     Error |    StdDev |
---------- |----------:|----------:|----------:|
    Switch | 12.451 ns | 0.0600 ns | 0.0561 ns |
    Lookup | 22.552 ns | 0.1108 ns | 0.1037 ns |

- Ivan Zlatanov

1

这是在发布版本上吗？ - mjwills

1

@mjwills 是的。运行时 = .NET Framework 4.7 (CLR 4.0.30319.42000)，32位 LegacyJIT-v4.7.2600.0；GC = 并发工作站（优先32位）在64位机器上。我也会在64位模式下进行测试。 - Ivan Zlatanov

@IvanZlatanov 那我怀疑分支预测是开关如此快的原因。尝试在每次迭代中给它一个随机类型而不是相同的类型。 - Rotem

@Rotem 不是这样的情况。我进行了几次测试——一次使用超过8个不同值的for循环，另一次使用Rand() % 8，两种情况下总体性能差异都增加了——主要是因为我们现在正在处理不是switch函数最坏情况的情况。 - Ivan Zlatanov

对这种非常快的代码进行分析是棘手的，我使用老式秒表获得非常不同的测量结果。我发现Lookup比Switch快约10%。测试中看不到的是成本去哪里了，GetType()几乎占用了一半的执行时间。也许直观而明显的是，Type是一个大对象。它确实被缓存，相当于一个字典查找 :) 但是，is运算符并不糟糕，只是在对象头中与类型句柄进行简单比较。在这个测试中，由于从来没有匹配，17个分支预测得非常好。 - Hans Passant

显示剩余2条评论

1个回答

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- Rhumborl · Accepted Answer

我绝不是一个IL性能大师，但如果你反编译并特别查看IL，这就有意义了。 is 运算符只有 4 个操作码（ldarg、isinst、ldnull、cgt），每个 switch 部分总共只有 7 个操作码，加上 goto 后为 17*7+6 = 125 最多。

相比之下，Dictionary.TryGetValue 可能只有一个方法调用，但在其中它执行了很多工作：哈希、循环和比较值。请参见：http://referencesource.microsoft.com/#mscorlib/system/collections/generic/dictionary.cs,2e5bc6d8c0f21e67。

public bool TryGetValue(TKey key, out TValue value) {
    int i = FindEntry(key);
    if (i >= 0) {
        value = entries[i].value;
        return true;
    }
    value = default(TValue);
    return false;
}

http://referencesource.microsoft.com/#mscorlib/system/collections/generic/dictionary.cs,bcd13bb775d408f1

private int FindEntry(TKey key) {
    if( key == null) {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
    }

    if (buckets != null) {
        int hashCode = comparer.GetHashCode(key) & 0x7FFFFFFF;
        for (int i = buckets[hashCode % buckets.Length]; i >= 0; i = entries[i].next) {
            if (entries[i].hashCode == hashCode && comparer.Equals(entries[i].key, key)) return i;
        }
    }
    return -1;
}

FindEntry函数内部的for循环单独占用了31个操作码，仅此部分就最多使用了527个操作码。

这是非常基本的分析，但很容易看出switch语句应该更具性能。然而，通常情况下，您需要考虑性能与可读性和可维护性之间的平衡。如果使用字典查找可以提供更好的代码，则很少有性能损失（15ns）会超过该优点。