Simplification to div/mod nodes (remove comparison against -1)
Summary
This merge request removes the comparison against -1 when processing div/mod nodes. The extra comparison and conditional branch ultimately cause a slowdown, since a divisor of -1 is comparatively uncommon. The issue of downsizing, where min_int div -1 must return min_int, no longer seems to trigger.
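For background, the edge case comes from two's-complement wrap-around: negating min_int yields min_int again, which is why min_int div -1 must return min_int under wrapping semantics, while a raw IDIV instruction faults on that operand pair. A minimal sketch (hypothetical program, not part of this merge request, assuming overflow checking is disabled with {$Q-}):

```pascal
program NegWrap;
{$Q-} { overflow checking off: arithmetic wraps in two's complement }
var
  MinInt: LongInt;
begin
  MinInt := Low(LongInt);              { -2147483648 }
  { Negation wraps back to the same value, so min_int div -1 must also
    yield min_int under wrapping semantics. }
  WriteLn(LongInt(-MinInt) = MinInt);  { prints TRUE }
end.
```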
An additional commit, a pure refactor, also makes an internal comparison in tmoddivnode.simplify more efficient.
System
- Processor architecture: All (but fully tested on i386 and x86_64)
What is the current bug behavior?
N/A
What is the behavior after applying this patch?
Expressions that perform an integer division (including mod) by a variable should now run faster and compile to smaller code. x86_64-win64 and i386-win32 have been tested and show no regressions.
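As a sketch of the kind of code affected (a hypothetical example, not taken from the patch), any integer division whose divisor is only known at run time used to compile to a cmpl $-1 / jne guard around the idivl, as in the listings below, and now compiles to the idivl alone:

```pascal
program DivByVariable;

function ScaledShare(Total, Parts: LongInt): LongInt;
begin
  { The divisor is a run-time variable, so the division cannot be folded
    at compile time; this is the pattern whose generated code changes. }
  ScaledShare := Total div Parts;
end;

begin
  WriteLn(ScaledShare(1000, 7));  { prints 142 }
end.
```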
Relevant logs and/or screenshots
A simple example in the ZipUtils unit (x86_64-win64 -O4) - before:
.section .text.n_ziputils_$$_fread$pointer$longint$longint$fileptr$$longint,"ax"
...
cmpl $-1,%ebx ; <-- The guard removed by this patch: is the divisor -1?
jne .Lj24 ; <-- Not -1: take the ordinary idivl path
movl 32(%rsp),%ecx
negl %ecx ; <-- Divisor is -1: negate the dividend instead of dividing
movl %ecx,%eax
jmp .Lj27
.p2align 4,,10
.p2align 3
.Lj24:
movl 32(%rsp),%eax
cltd
idivl %ebx
jmp .Lj27
...
.Lj27:
...
After:
.section .text.n_ziputils_$$_fread$pointer$longint$longint$fileptr$$longint,"ax"
...
movl 32(%rsp),%eax
cltd
idivl %ebx
jmp .Lj24
...
.Lj24:
...
A long-winded example in the Graph unit - before:
.Lj14:
...
movq -32(%rbp),%rax
movswl (%rax),%ecx
movq -16(%rbp),%rdx
movswl (%rdx),%eax
movq -24(%rbp),%rdx
movswl (%rdx),%r8d
subl %r8d,%eax
cmpl $-1,%eax ; <-- Same guard on the divisor, removed by this patch
jne .Lj22
movq -8(%rbp),%rax
movswl (%rax),%r9d
movq -32(%rbp),%rdx
movswl (%rdx),%eax
subl %eax,%r9d
movswl 48(%rbp),%eax
movq -24(%rbp),%r8
movswl (%r8),%edx
subl %edx,%eax
imull %eax,%r9d
movl %r9d,%r8d
negl %r8d
jmp .Lj23
.p2align 4,,10
.p2align 3
.Lj22:
movq -8(%rbp),%rdx
movswl (%rdx),%eax
movq -32(%rbp),%r9
movswl (%r9),%edx
subl %edx,%eax
movswl 48(%rbp),%r9d
movq -24(%rbp),%r10
movswl (%r10),%edx
subl %edx,%r9d
imull %r9d,%eax
movq -16(%rbp),%r10
movswl (%r10),%r9d
subl %edx,%r9d
cltd
idivl %r9d
movl %eax,%r8d
.Lj23:
leal (%ecx,%r8d),%eax
movw %ax,%r12w
movw 48(%rbp),%r13w
jmp .Lj24
.p2align 4,,10
.p2align 3
.Lj20:
...
After:
.Lj14:
...
movq -16(%rbp),%rdx
movswl (%rdx),%ecx
movq -32(%rbp),%rdx
movswl (%rdx),%eax
subl %ecx,%eax
movswl 48(%rbp),%r9d
movq -8(%rbp),%r8
movswl (%r8),%edx
subl %edx,%r9d
imull %r9d,%eax
movq -24(%rbp),%r9
movswl (%r9),%r8d
subl %edx,%r8d
cltd
idivl %r8d
addl %ecx,%eax
movw %ax,%r12w
movw 48(%rbp),%r13w
jmp .Lj21 ; <-- Same destination as .Lj24 above.
.p2align 4,,10
.p2align 3
.Lj20:
...