好的,我有这段代码:
constexpr unsigned N = 1000;
void f1(char* sum, char* a, char* b) {
for(int i = 0; i < N; ++i) {
sum[i] = a[i] + b[i];
}
}
void f2(char* sum, char* a, char* b) {
char* end = sum + N;
while(sum != end) {
*sum++ = *a++ + *b++;
}
}
我想查看GCC 4.7.2将生成的代码。因此我运行了 g++ -march=native -O3 -masm=intel -S a.c++ -std=c++11
,并得到以下输出:
.file "a.c++"
.intel_syntax noprefix
.text
.p2align 4,,15
.globl _Z2f1PcS_S_
.type _Z2f1PcS_S_, @function
_Z2f1PcS_S_:
.LFB0:
.cfi_startproc
lea rcx, [rdx+16]
lea rax, [rdi+16]
cmp rdi, rcx
setae r8b
cmp rdx, rax
setae cl
or cl, r8b
je .L5
lea rcx, [rsi+16]
cmp rdi, rcx
setae cl
cmp rsi, rax
setae al
or cl, al
je .L5
xor eax, eax
.p2align 4,,10
.p2align 3
.L3:
movdqu xmm0, XMMWORD PTR [rdx+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddb xmm0, xmm1
movdqu XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, 992
jne .L3
mov ax, 8
mov r9d, 992
.L2:
sub eax, 1
lea rcx, [rdx+r9]
add rdi, r9
lea r8, [rax+1]
add rsi, r9
xor eax, eax
.p2align 4,,10
.p2align 3
.L4:
movzx edx, BYTE PTR [rcx+rax]
add dl, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], dl
add rax, 1
cmp rax, r8
jne .L4
rep
ret
.L5:
mov eax, 1000
xor r9d, r9d
jmp .L2
.cfi_endproc
.LFE0:
.size _Z2f1PcS_S_, .-_Z2f1PcS_S_
.p2align 4,,15
.globl _Z2f2PcS_S_
.type _Z2f2PcS_S_, @function
_Z2f2PcS_S_:
.LFB1:
.cfi_startproc
lea rcx, [rdx+16]
lea rax, [rdi+16]
cmp rdi, rcx
setae r8b
cmp rdx, rax
setae cl
or cl, r8b
je .L19
lea rcx, [rsi+16]
cmp rdi, rcx
setae cl
cmp rsi, rax
setae al
or cl, al
je .L19
xor eax, eax
.p2align 4,,10
.p2align 3
.L17:
movdqu xmm0, XMMWORD PTR [rdx+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddb xmm0, xmm1
movdqu XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, 992
jne .L17
add rdi, 992
add rsi, 992
add rdx, 992
mov r8d, 8
.L16:
xor eax, eax
.p2align 4,,10
.p2align 3
.L18:
movzx ecx, BYTE PTR [rdx+rax]
add cl, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], cl
add rax, 1
cmp rax, r8
jne .L18
rep
ret
.L19:
mov r8d, 1000
jmp .L16
.cfi_endproc
.LFE1:
.size _Z2f2PcS_S_, .-_Z2f2PcS_S_
.ident "GCC: (GNU) 4.7.2"
.section .note.GNU-stack,"",@progbits
我很菜于汇编语言的阅读,因此我决定添加一些标记来知道循环的主体部分所在:
constexpr unsigned N = 1000;
void f1(char* sum, char* a, char* b) {
for(int i = 0; i < N; ++i) {
asm("# im in ur loop");
sum[i] = a[i] + b[i];
}
}
void f2(char* sum, char* a, char* b) {
char* end = sum + N;
while(sum != end) {
asm("# im in ur loop");
*sum++ = *a++ + *b++;
}
}
而GCC输出了这个:
.file "a.c++"
.intel_syntax noprefix
.text
.p2align 4,,15
.globl _Z2f1PcS_S_
.type _Z2f1PcS_S_, @function
_Z2f1PcS_S_:
.LFB0:
.cfi_startproc
xor eax, eax
.p2align 4,,10
.p2align 3
.L2:
#APP
# 4 "a.c++" 1
# im in ur loop
# 0 "" 2
#NO_APP
movzx ecx, BYTE PTR [rdx+rax]
add cl, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], cl
add rax, 1
cmp rax, 1000
jne .L2
rep
ret
.cfi_endproc
.LFE0:
.size _Z2f1PcS_S_, .-_Z2f1PcS_S_
.p2align 4,,15
.globl _Z2f2PcS_S_
.type _Z2f2PcS_S_, @function
_Z2f2PcS_S_:
.LFB1:
.cfi_startproc
xor eax, eax
.p2align 4,,10
.p2align 3
.L6:
#APP
# 12 "a.c++" 1
# im in ur loop
# 0 "" 2
#NO_APP
movzx ecx, BYTE PTR [rdx+rax]
add cl, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], cl
add rax, 1
cmp rax, 1000
jne .L6
rep
ret
.cfi_endproc
.LFE1:
.size _Z2f2PcS_S_, .-_Z2f2PcS_S_
.ident "GCC: (GNU) 4.7.2"
.section .note.GNU-stack,"",@progbits
这段代码要短得多,并且有一些显著的区别,比如缺少SIMD指令。我希望的输出结果是相同的,在其中某个位置有一些注释。我在做出了一些错误的假设吗?是不是由于汇编注释,GCC优化器受到了阻碍?
asm
形式,其中输出和破坏列表为空。 - Kerrek SBasm("# im in ur loop" : : );
可以翻译为:汇编语句asm("# im in ur loop" : : );
,意思是“我在你的循环里面”,更多信息请参考文档。 - Mike Seymour