声明为 `static` 的局部变量的生命周期贯穿整个程序运行期间,通常存储在数据段中。编译器通过生成一个包含这些值的段(section)来实现这一点。
未声明为静态的局部变量通常位于堆栈上,并且每次进入变量作用域时必须进行初始化。
先看 `static` 情况下的汇编代码,MSVC 2015 输出以下内容:
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.00.24215.1
TITLE MyLBP.c
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
; Constant-data segment that holds the initial values of the static array.
CONST SEGMENT
; The label below is the decorated name of the function-local `Arr` (see the
; trailing comment on that line). Each DQ emits one 8-byte value written as a
; raw hex bit pattern; the compiler's own trailing comment shows the decimal value.
?Arr@?1??tfuuuuuuu@@9@9 DQ 04060c00000000000r ; 134 ; `tfuuuuuuu'::`2'::Arr
DQ 03fe15efd20a7955br ; 0.542845
DQ 03fdf59701e4b19afr ; 0.489834
DQ 0bfd8e38e9ab7fcb1r ; -0.388889
DQ 0bfe59f22c01e68a1r ; -0.675676
DQ 0bfeb13b15d5aa410r ; -0.846154
DQ 0bfe2c2355f07776er ; -0.586207
DQ 03fefffffbf935359r ; 1
...
; Advances the location counter by 1036128 bytes without listing any values.
; NOTE(review): presumably the zero-filled remainder of the array - confirm
; against the C source.
ORG $+1036128
CONST ENDS
PUBLIC _tfuuuuuuu
EXTRN __fltused:DWORD
; Function compile flags: /Odtp
; Code segment for the version in which Arr is declared static.
_TEXT SEGMENT
; Offset of the Ind argument relative to ebp.
_Ind$ = 8 ; size = 4
; _tfuuuuuuu: pushes element Ind of the array onto the x87 stack and returns.
; The element is read directly from the ?Arr@?1??tfuuuuuuu@@9@9 data symbol;
; the function contains no code that initializes or copies the array.
_tfuuuuuuu PROC
; File c:\users\dennis bush\documents\x2.c
; Line 4
push ebp
mov ebp, esp
; Line 106
; eax = Ind
mov eax, DWORD PTR _Ind$[ebp]
; Load the 8-byte element at Arr + Ind*8.
fld QWORD PTR ?Arr@?1??tfuuuuuuu@@9@9[eax*8]
; Line 107
pop ebp
; `ret 0`: return without popping any argument bytes.
ret 0
_tfuuuuuuu ENDP
_TEXT ENDS
END
gcc 4.8.5输出如下内容:
.file "MyLBP.c"
.text
.globl tfuuuuuuu
.type tfuuuuuuu, @function
# tfuuuuuuu (static-array version): loads element Ind of Arr.1724 into %xmm0
# and returns. Ind arrives in %edi. The array is read directly from its
# global symbol; nothing is copied or initialized on entry.
tfuuuuuuu:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
# Spill Ind to the stack slot at -4(%rbp) and reload it.
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
# Sign-extend the 32-bit index in %eax to 64 bits in %rax.
cltq
# Load the 8-byte element at Arr.1724 + Ind*8.
movq Arr.1724(,%rax,8), %rax
# Bounce the value through a stack slot to move it into %xmm0.
movq %rax, -16(%rbp)
movsd -16(%rbp), %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size tfuuuuuuu, .-tfuuuuuuu
# Read-only storage for the static array. Each consecutive pair of .long
# values is the low half followed by the high half of one 8-byte element;
# the first pair (0, 1080082432 = 0x4060C000) is the bit pattern of 134.0.
.section .rodata
.align 32
.type Arr.1724, @object
# Total object size in bytes (1238400 / 8 = 154800 eight-byte elements).
.size Arr.1724, 1238400
Arr.1724:
.long 0
.long 1080082432
.long 547853659
.long 1071734525
.long 508238255
.long 1071602032
.long 2595749041
.long -1076305010
.long 3223218337
...
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-16)"
.section .note.GNU-stack,"",@progbits
因此,两种编译器都把数据定义为全局数据,并在函数中直接引用该全局数组。
现在让我们看看非静态代码。首先是 MSVC 2015:
TITLE MyLBP.c
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC _tfuuuuuuu
PUBLIC __real@3e45798ee2308c3a
PUBLIC __real@3f40e1cf9350aa3c
PUBLIC __real@3f43b1f90beff84b
PUBLIC __real@3f4c6220dc6e8066
PUBLIC __real@3f4ea4c648794089
PUBLIC __real@3f50023666188dc0
PUBLIC __real@3f53957e56f300e9
PUBLIC __real@3f55235d7d33b25f
PUBLIC __real@3f5828f66e5bd33a
PUBLIC __real@3f5c044284dfce31
PUBLIC __real@3f5c87c05341c674
...
EXTRN @__security_check_cookie@4:PROC
EXTRN __chkstk:PROC
EXTRN _memset:PROC
EXTRN ___security_cookie:DWORD
EXTRN __fltused:DWORD
; In this (non-static) listing every distinct initializer value is emitted in
; its own CONST SEGMENT / ENDS pair under its own label. The hex suffix of
; each __real@ label is the same bit pattern as the DQ value it names.
CONST SEGMENT
__real@bff0000000000000 DQ 0bff0000000000000r
CONST ENDS
CONST SEGMENT
__real@bfefffffdfc9a9ad DQ 0bfefffffdfc9a9adr
CONST ENDS
CONST SEGMENT
__real@bfefffffbf935359 DQ 0bfefffffbf935359r
CONST ENDS
CONST SEGMENT
__real@bfefffff9f5cfd06 DQ 0bfefffff9f5cfd06r
CONST ENDS
CONST SEGMENT
__real@bfefffff7f26a6b3 DQ 0bfefffff7f26a6b3r
CONST ENDS
CONST SEGMENT
__real@bfefffff5ef05060 DQ 0bfefffff5ef05060r
CONST ENDS
...
; Code segment for the version in which Arr is a non-static local array.
_TEXT SEGMENT
; ebp-relative offsets: the local array, the slot that holds the
; security-cookie value, and the Ind argument.
_Arr$ = -1238404
__$ArrayPad$ = -4
_Ind$ = 8
; _tfuuuuuuu: rebuilds the whole array in the stack frame on every call, then
; pushes element Ind onto the x87 stack and returns.
_tfuuuuuuu PROC
push ebp
mov ebp, esp
; Frame size (1238404 bytes) is passed to __chkstk in eax.
; NOTE(review): __chkstk is external; presumably it probes and allocates the
; frame - confirm against the CRT documentation.
mov eax, 1238404
call __chkstk
; Store (___security_cookie XOR ebp) in the __$ArrayPad$ slot.
mov eax, DWORD PTR ___security_cookie
xor eax, ebp
mov DWORD PTR __$ArrayPad$[ebp], eax
; Element-by-element initialization: each array slot takes two movsd
; instructions - one loads a named __real@ constant into xmm0, one stores
; xmm0 into the slot. The slot offset advances by 8 bytes per element.
movsd xmm0, QWORD PTR __real@4060c00000000000
movsd QWORD PTR _Arr$[ebp], xmm0
movsd xmm0, QWORD PTR __real@3fe15efd20a7955b
movsd QWORD PTR _Arr$[ebp+8], xmm0
movsd xmm0, QWORD PTR __real@3fdf59701e4b19af
movsd QWORD PTR _Arr$[ebp+16], xmm0
movsd xmm0, QWORD PTR __real@bfd8e38e9ab7fcb1
movsd QWORD PTR _Arr$[ebp+24], xmm0
movsd xmm0, QWORD PTR __real@bfe59f22c01e68a1
movsd QWORD PTR _Arr$[ebp+32], xmm0
movsd xmm0, QWORD PTR __real@bfeb13b15d5aa410
movsd QWORD PTR _Arr$[ebp+40], xmm0
movsd xmm0, QWORD PTR __real@bfe2c2355f07776e
movsd QWORD PTR _Arr$[ebp+48], xmm0
...
; memset(Arr + 202272 bytes, 0, 1036128): the rest of the array is zero-filled
; with one call. Arguments are pushed last-to-first (size, value, pointer).
push 1036128
push 0
lea eax, DWORD PTR _Arr$[ebp+202272]
push eax
call _memset
; Remove the three 4-byte arguments pushed for _memset.
add esp, 12
; ecx = Ind; load the 8-byte element at Arr + Ind*8.
mov ecx, DWORD PTR _Ind$[ebp]
fld QWORD PTR _Arr$[ebp+ecx*8]
; Reload the saved cookie value, undo the XOR with ebp, and pass it in ecx to
; the cookie check.
mov ecx, DWORD PTR __$ArrayPad$[ebp]
xor ecx, ebp
call @__security_check_cookie@4
mov esp, ebp
pop ebp
; `ret 0`: return without popping any argument bytes.
ret 0
_tfuuuuuuu ENDP
_TEXT ENDS
END
初始值仍然作为全局数据存储。不过请注意,每个值在内部都有一个单独的名称,并且为数组中的每个值都生成了 2 条移动(movsd)指令。生成这些名称和显式的移动指令,正是代码生成耗时如此之长的原因。
现在是gcc 4.8.5版本:
.file "MyLBP.c"
# Read-only image of the local array's initial values, stored under the single
# label .LC0. Each consecutive pair of .long values is the low half followed
# by the high half of one 8-byte element.
.section .rodata
.align 32
.LC0:
.long 0
.long 1080082432
.long 547853659
.long 1071734525
.long 508238255
.long 1071602032
.long 2595749041
.long -1076305010
.long 3223218337
.long -1075470558
...
.text
.globl tfuuuuuuu
.type tfuuuuuuu, @function
# tfuuuuuuu (non-static version): copies the whole initializer image from .LC0
# into a stack-resident array with a single memcpy call, then loads element
# Ind into %xmm0 and returns. Ind arrives in %edi.
tfuuuuuuu:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
# Reserve the frame: the 1238400-byte array plus 16 bytes for Ind and a temporary.
subq $1238416, %rsp
# Spill Ind to -1238404(%rbp).
movl %edi, -1238404(%rbp)
# memcpy(dest = array at -1238400(%rbp), src = .LC0, n = 1238400):
# %rdi = dest, %rsi = src, %rdx = n.
leaq -1238400(%rbp), %rax
movl $.LC0, %ecx
movl $1238400, %edx
movq %rcx, %rsi
movq %rax, %rdi
call memcpy
# Reload Ind and sign-extend it to 64 bits.
movl -1238404(%rbp), %eax
cltq
# Load the 8-byte element at array + Ind*8.
movq -1238400(%rbp,%rax,8), %rax
# Bounce the value through a stack slot to move it into %xmm0.
movq %rax, -1238416(%rbp)
movsd -1238416(%rbp), %xmm0
# Tear down the frame (restore %rsp from %rbp, pop %rbp).
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size tfuuuuuuu, .-tfuuuuuuu
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-16)"
.section .note.GNU-stack,"",@progbits
gcc 没有为每个值生成显式的复制指令,而只是调用 `memcpy` 将全局数据中的值复制到局部数组中,因此生成初始化代码的速度要快得多。
所以结论是:MSVC 初始化局部变量的方式非常低效。
此外,正如评论中所指出的,这是一个已确认的 bug,将在 VS 2019 中修复。