请注意:我不信任“在模块加载时禁止 JIT 优化”选项,所以我在不调试的情况下启动进程,等 JIT 运行之后再附加调试器。
在单行运行更快的版本中,这是Main
:
SingleLineTest()
00000000 push ebp
00000001 mov ebp,esp
00000003 call dword ptr ds:[0019380Ch]
MultiLineTest()
00000009 call dword ptr ds:[00193818h]
SingleLineTest()
0000000f call dword ptr ds:[0019380Ch]
MultiLineTest()
00000015 call dword ptr ds:[00193818h]
SingleLineTest()
0000001b call dword ptr ds:[0019380Ch]
MultiLineTest()
00000021 call dword ptr ds:[00193818h]
00000027 pop ebp
}
00000028 ret
请注意,MultiLineTest
已经放置在 8 字节边界上,而 SingleLineTest
则放置在 4 字节边界上。
下面是两者运行速度相同的版本的 Main
:
MultiLineTest()
00000000 push ebp
00000001 mov ebp,esp
00000003 call dword ptr ds:[00153818h]
SingleLineTest()
00000009 call dword ptr ds:[0015380Ch]
MultiLineTest()
0000000f call dword ptr ds:[00153818h]
SingleLineTest()
00000015 call dword ptr ds:[0015380Ch]
MultiLineTest()
0000001b call dword ptr ds:[00153818h]
SingleLineTest()
00000021 call dword ptr ds:[0015380Ch]
MultiLineTest()
00000027 call dword ptr ds:[00153818h]
0000002d pop ebp
}
0000002e ret
令人惊奇的是,JIT 选择的地址最后 4 位数字完全相同,尽管据称它是按相反的顺序处理这两个方法的。我不确定自己是否还相信这一点。
还需要更深入的挖掘。我记得有人提到过,循环之前的代码在两个版本中并不完全相同?我去调查一下。
这是SingleLineTest
的“慢速”版本(我检查过,函数地址的最后几位没有改变)。
Stopwatch stopwatch = new Stopwatch()
00000000 push ebp
00000001 mov ebp,esp
00000003 push edi
00000004 push esi
00000005 push ebx
00000006 mov ecx,7A5A2C68h
0000000b call FFF91EA0
00000010 mov esi,eax
00000012 mov dword ptr [esi+4],0
00000019 mov dword ptr [esi+8],0
00000020 mov byte ptr [esi+14h],0
00000024 mov dword ptr [esi+0Ch],0
0000002b mov dword ptr [esi+10h],0
stopwatch.Start()
00000032 cmp byte ptr [esi+14h],0
00000036 jne 00000047
00000038 call 7A22B314
0000003d mov dword ptr [esi+0Ch],eax
00000040 mov dword ptr [esi+10h],edx
00000043 mov byte ptr [esi+14h],1
int count = 0
00000047 xor edi,edi
for (uint i = 0
00000049 xor edx,edx
count += i % 16 == 0 ? 1 : 0
0000004b mov eax,edx
0000004d and eax,0Fh
00000050 test eax,eax
00000052 je 00000058
00000054 xor eax,eax
00000056 jmp 0000005D
00000058 mov eax,1
0000005d add edi,eax
for (uint i = 0
0000005f inc edx
00000060 cmp edx,3B9ACA00h
00000066 jb 0000004B
}
stopwatch.Stop()
00000068 mov ecx,esi
0000006a call 7A23F2C0
Console.WriteLine("Single-line test --> Count: {0}, Time: {1}", count, stopwatch.ElapsedMilliseconds)
0000006f mov ecx,797C29B4h
00000074 call FFF91EA0
00000079 mov ecx,eax
0000007b mov dword ptr [ecx+4],edi
0000007e mov ebx,ecx
00000080 mov ecx,797BA240h
00000085 call FFF91EA0
0000008a mov edi,eax
0000008c mov ecx,esi
0000008e call 7A23ABE8
00000093 push edx
00000094 push eax
00000095 push 0
00000097 push 2710h
0000009c call 783247EC
000000a1 mov dword ptr [edi+4],eax
000000a4 mov dword ptr [edi+8],edx
000000a7 mov esi,edi
000000a9 call 793C6F40
000000ae push ebx
000000af push esi
000000b0 mov ecx,eax
000000b2 mov edx,dword ptr ds:[03392034h]
000000b8 mov eax,dword ptr [ecx]
000000ba mov eax,dword ptr [eax+3Ch]
000000bd call dword ptr [eax+1Ch]
000000c0 pop ebx
}
000000c1 pop esi
000000c2 pop edi
000000c3 pop ebp
000000c4 ret
还有“快速”版本:
Stopwatch stopwatch = new Stopwatch()
00000000 push ebp
00000001 mov ebp,esp
00000003 push edi
00000004 push esi
00000005 push ebx
00000006 mov ecx,7A5A2C68h
0000000b call FFE11F70
00000010 mov esi,eax
00000012 mov ecx,esi
00000014 call 7A1068BC
stopwatch.Start()
00000019 cmp byte ptr [esi+14h],0
0000001d jne 0000002E
0000001f call 7A12B3E4
00000024 mov dword ptr [esi+0Ch],eax
00000027 mov dword ptr [esi+10h],edx
0000002a mov byte ptr [esi+14h],1
int count = 0
0000002e xor edi,edi
for (uint i = 0
00000030 xor edx,edx
count += i % 16 == 0 ? 1 : 0
00000032 mov eax,edx
00000034 and eax,0Fh
00000037 test eax,eax
00000039 je 0000003F
0000003b xor eax,eax
0000003d jmp 00000044
0000003f mov eax,1
00000044 add edi,eax
for (uint i = 0
00000046 inc edx
00000047 cmp edx,3B9ACA00h
0000004d jb 00000032
}
stopwatch.Stop()
0000004f mov ecx,esi
00000051 call 7A13F390
Console.WriteLine("Single-line test --> Count: {0}, Time: {1}", count, stopwatch.ElapsedMilliseconds)
00000056 mov ecx,797C29B4h
0000005b call FFE11F70
00000060 mov ecx,eax
00000062 mov dword ptr [ecx+4],edi
00000065 mov ebx,ecx
00000067 mov ecx,797BA240h
0000006c call FFE11F70
00000071 mov edi,eax
00000073 mov ecx,esi
00000075 call 7A13ACB8
0000007a push edx
0000007b push eax
0000007c push 0
0000007e push 2710h
00000083 call 782248BC
00000088 mov dword ptr [edi+4],eax
0000008b mov dword ptr [edi+8],edx
0000008e mov esi,edi
00000090 call 792C7010
00000095 push ebx
00000096 push esi
00000097 mov ecx,eax
00000099 mov edx,dword ptr ds:[03562030h]
0000009f mov eax,dword ptr [ecx]
000000a1 mov eax,dword ptr [eax+3Ch]
000000a4 call dword ptr [eax+1Ch]
000000a7 pop ebx
}
000000a8 pop esi
000000a9 pop edi
000000aa pop ebp
000000ab ret
只有循环,左边快,右边慢:
00000030 xor edx,edx 00000049 xor edx,edx
00000032 mov eax,edx 0000004b mov eax,edx
00000034 and eax,0Fh 0000004d and eax,0Fh
00000037 test eax,eax 00000050 test eax,eax
00000039 je 0000003F 00000052 je 00000058
0000003b xor eax,eax 00000054 xor eax,eax
0000003d jmp 00000044 00000056 jmp 0000005D
0000003f mov eax,1 00000058 mov eax,1
00000044 add edi,eax 0000005d add edi,eax
00000046 inc edx 0000005f inc edx
00000047 cmp edx,3B9ACA00h 00000060 cmp edx,3B9ACA00h
0000004d jb 00000032 00000066 jb 0000004B
指令是相同的(由于都是相对跳转,即使反汇编显示的地址不同,机器码也完全相同),但对齐方式不同。共有三个跳转。在“慢”版本中,跳转到加载常数 1 处的
je
是对齐的,而在“快”版本中则没有对齐,但这几乎没有影响,因为该跳转只在 1/16 的迭代中才会发生。另外两个跳转(加载常数零之后的
jmp
和重复整个循环的
jb
)执行了数百万次,并在“快速”版本中对齐。
我认为这就是确凿证据。
对 SingleLineTest
进行测试得到的平均值分别为 1412ms 和 1490ms,对 MultiLineTest
进行测试得到平均值分别为1773ms和1792ms。这意味着速度提升了26%和20%。对于每台计算机,速度提升的标准差为2%。我本来期望会看到一些机器之间的差异,但8%的差异令人惊讶。 - Edward Brey