[x86] Better lookahead for long-range OptPass1MOV optimisations
Summary
This merge request improves the long-range optimisations in OptPass1MOV
that permits the removal of more redundant instructions on high optimisation settings. This is performed by looking past instructions that read the target register without modifying it, since it may find, for exmaple, a MOV instruction that writes the same value to the target register (Mov2Nop 2). The compiler will also now look ahead in cases where the initial MOV writes a constant to a 32-bit register, and the distant instructions read from the full 64-bit register... so long as the constant is between 0 and $7FFFFFFF.
Additionally, There are some minor changes elsewhere to help with this functionality too:
-
RegModifiedByInstruction
andRegInInstruction
are now more accurate on IMUL and IDIV instructions. - An
AllocRegBetween
call was missing for the "Mov2Nop 2" optimisation. This created regions where a register was marked as not in use when its value was still live, and may cause problems if another optimisation requests a new temporary register (no examples were noted though). - The cross-jump optimisations now handle Jcc instructions better so as to not automatically assume all registers are in use.
- After the cross-jump optimisations, if
mov %reg1,%reg2
appears, and then no instructions use %reg2 until a label is found, and thenmov %reg1,%reg2
is found right after the label, the first MOV is removed.
System
- Processor architecture: i386, x86_64
What is the current bug behavior?
N/A
What is the behavior after applying this patch?
A large number of redundant MOV instructions get removed, and subsequence optimisation cascades, like jump redirections, ensure more efficient code afterwards.
Relevant logs and/or screenshots
In the System unit under x86_64-win64, -O4, there are quite a few cases where a constant is written to %eax, and then %rax is written to a memory location. Before:
...
.Lj1510:
xorl %eax,%eax
movq %rax,-376(%rbp)
jmp .Lj1503
.balign 16,0x90
.Lj1511:
movl $1,%eax
movq %rax,-376(%rbp)
jmp .Lj1503
.balign 16,0x90
.Lj1512:
movl $2,%eax
movq %rax,-376(%rbp)
jmp .Lj1503
...
These now get merged. After:
...
.Lj1510:
movq $0,-376(%rbp)
jmp .Lj1503
.balign 16,0x90
.Lj1511:
movq $1,-376(%rbp)
jmp .Lj1503
.balign 16,0x90
.Lj1512:
movq $2,-376(%rbp)
jmp .Lj1503
...
In aasmcpu, the better handling of Jcc instructions permit the removal of a redundant MOV. Before:
...
movl %ecx,%esi
...
je .Lj1982
...
.Lj1982:
...
jne .Lj1985
...
.Lj1985:
movl %esi,%ecx
call CPUBASE_$$_REG2OPSIZE$TREGISTER$$TOPSIZE
...
After (the jumps highlighted in the code are the only references to their target labels):
...
movl %ecx,%esi
...
je .Lj1982
...
.Lj1982:
...
jne .Lj1985
addq $1,48(%rsp)
.Lj1985:
call CPUBASE_$$_REG2OPSIZE$TREGISTER$$TOPSIZE
...
in the same unit, a whole string of writes to the stack get optimised - before:
...
.Lj2028:
movb $0,-688(%rbp)
movq $0,-712(%rbp)
xorl %eax,%eax
movq %rax,-728(%rbp)
movq $0,-736(%rbp)
movq $0,-784(%rbp)
movq $0,-720(%rbp)
movq %rax,-752(%rbp)
movq $0,-696(%rbp)
xorl %eax,%eax
movq %rax,-776(%rbp)
movq $0,-760(%rbp)
movq $0,-704(%rbp)
movq %rax,-744(%rbp)
xorb %r15b,%r15b
movb $0,-792(%rbp)
movb $0,-768(%rbp)
...
After:
...
.Lj2028:
movb $0,-688(%rbp)
movq $0,-712(%rbp)
movq $0,-728(%rbp)
movq $0,-736(%rbp)
movq $0,-784(%rbp)
movq $0,-720(%rbp)
movq $0,-752(%rbp)
movq $0,-696(%rbp)
movq $0,-776(%rbp)
movq $0,-760(%rbp)
movq $0,-704(%rbp)
movq $0,-744(%rbp)
xorb %r15b,%r15b
movb $0,-792(%rbp)
movb $0,-768(%rbp)
...
Going back to the System unit, a very long-range optimisation is made that initially isn't obvious - before:
...
.Lj2028:
...
movl %r8d,%r10d
movslq %r8d,%r8
...
.Lj2032:
...
.Lj2033:
...
.Lj2030:
movslq %r10d,%r10
leaq (%r10,%r10,2),%rdx
...
After a large number of instructions, labels and jumps:
...
.Lj2028:
...
movslq %r8d,%r10
movslq %r8d,%r8
...
.Lj2032:
...
.Lj2033:
...
.Lj2030:
leaq (%r10,%r10,2),%rdx
...
In a slightly amusing example with optloop - before:
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $2,%eax
jbe .Lj22
subl $1,%eax
je .Lj88
subl $1,%eax
jb .Lj22
subl $3,%eax
jbe .Lj89
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $8,%eax
jbe .Lj90
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $16,%eax
jbe .Lj91
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $32,%eax
jbe .Lj92
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $32704,%eax
jbe .Lj93
jmp .Lj22
.balign 16,0x90
.Lj88:
movl $2,%ebx
jmp .Lj22
.balign 16,0x90
.Lj89:
movl $4,%ebx
jmp .Lj22
.balign 16,0x90
.Lj90:
movl $8,%ebx
jmp .Lj22
.balign 16,0x90
.Lj91:
movl $16,%ebx
jmp .Lj22
.balign 16,0x90
.Lj92:
movl $32,%ebx
jmp .Lj22
.balign 16,0x90
.Lj93:
movl $64,%ebx
jmp .Lj22
...
.Lj22:
movq 32(%rsp),%rax
nop
leaq 136(%rsp),%rsp
popq %rbp
.Lc27:
popq %r15
.Lc28:
popq %r14
.Lc29:
popq %r13
.Lc30:
popq %r12
.Lc31:
popq %rsi
.Lc32:
popq %rdi
.Lc33:
popq %rbx
After:
jb .Lj22
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $2,%eax
jbe .Lj22
subl $1,%eax
je .Lj22
subl $1,%eax
jb .Lj22
subl $3,%eax
jbe .Lj22
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $8,%eax
jbe .Lj22
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $16,%eax
jbe .Lj22
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $32,%eax
jbe .Lj22
# Peephole Optimization: SUB; ADD/SUB -> SUB
subl $32704,%eax
# Peephole Optimization: Removed "movl $2,%ebx" as the destination is overwritten after the jump (MovJmp2Jmp)
# Peephole Optimization: Removed "movl $4,%ebx" as the destination is overwritten after the jump (MovJmp2Jmp)
# Peephole Optimization: Removed "movl $8,%ebx" as the destination is overwritten after the jump (MovJmp2Jmp)
# Peephole Optimization: Removed "movl $16,%ebx" as the destination is overwritten after the jump (MovJmp2Jmp)
# Peephole Optimization: Removed "movl $32,%ebx" as the destination is overwritten after the jump (MovJmp2Jmp)
# Peephole Optimization: Removed "movl $64,%ebx" as the destination is overwritten after the jump (MovJmp2Jmp)
jmp .Lj22
...
.Lj22:
movq 32(%rsp),%rax
nop
leaq 136(%rsp),%rsp
popq %rbp
.Lc27:
popq %r15
.Lc28:
popq %r14
.Lc29:
popq %r13
.Lc30:
popq %r12
.Lc31:
popq %rsi
.Lc32:
popq %rdi
.Lc33:
popq %rbx
Initially I thought this was bugged, first because it didn't click that popq %rbx
was the instruction that was overwriting %ebx but it still led me to wonder what %ebx was being set for in the first place. Looking at the source code cleared that up!
...
case unrolls of
1..2:
;
3:
unrolls:=2;
4..7:
unrolls:=4;
{ unrolls>4 already make no sense imo, but who knows (FK) }
8..15:
unrolls:=8;
16..31:
unrolls:=16;
32..63:
unrolls:=32;
64..$7fff:
unrolls:=64;
else
exit;
end;
{ we don't handle this yet }
exit;
end;
Combined with !283 (merged), it removes the repeated constant in the adler unit - before:
.Lj14:
...
movl %r10d,%ebx
movq $281539415969072,%rdx
mulx %rbx,%rsi,%rsi
imull $65521,%esi
subl %esi,%ebx
movl %ebx,%r10d
movl %ecx,%ebx
movq $281539415969072,%rdx
mulx %rbx,%rsi,%rsi
imull $65521,%esi
subl %esi,%ebx
movl %ebx,%ecx
...
After:
.Lj14:
...
movl %r10d,%ebx
movq $281539415969072,%rdx
mulx %rbx,%rsi,%rsi
imull $65521,%esi
subl %esi,%ebx
movl %ebx,%r10d
movl %ecx,%ebx
mulx %rbx,%rsi,%rsi
imull $65521,%esi
subl %esi,%ebx
movl %ebx,%ecx
...