[Cross-platform] Constant propagation now evaluates real-valued constants
Summary
This merge adds a missing feature (a single line of code) to constant-propagation that now permits it to propagate floating-point constants.
System
- Processor architecture: Cross-platform
What is the current bug behavior?
Constant propagation does not propagate floating-point constants.
What is the behavior after applying this patch?
Floating-point constants are now propagated.
Relevant logs and/or screenshots
To start with a very simple example - in the `spe` unit (x86_64-win64, -O4 -OoCONSTPROP -OoDFA) - before:
.section .text.n_spe_$$_speerf$double$$double,"ax"
...
xorpd %xmm1,%xmm1
movl $16,%eax
movapd %xmm8,%xmm9
mulsd %xmm2,%xmm9
subsd %xmm1,%xmm9
addsd TC_$SPE$_$SPEERF$DOUBLE$$DOUBLE_$$_D+120(%rip),%xmm9
...
.Lj171:
After - thanks to constant propagation, the compiler now knows that `%xmm1` contains zero, so the subtraction is a null operation and is removed:
.section .text.n_spe_$$_speerf$double$$double,"ax"
...
xorpd %xmm1,%xmm1
movl $16,%eax
movapd %xmm8,%xmm9
mulsd %xmm2,%xmm9
addsd TC_$SPE$_$SPEERF$DOUBLE$$DOUBLE_$$_D+120(%rip),%xmm9
...
.Lj171:
...
In the `eigh2` unit - before:
.section .text.n_eigh2_$$_balance$double$longint$longint$longint$longint$double,"ax"
...
movsd _$EIGH2$_Ld5(%rip),%xmm8
movapd %xmm8,%xmm9
mulsd %xmm9,%xmm9
After - though the registers are allocated differently, the square of `_$EIGH2$_Ld5(%rip)` is now precalculated and stored directly, thus breaking the dependency chain:
.section .text.n_eigh2_$$_balance$double$longint$longint$longint$longint$double,"ax"
...
movsd _$EIGH2$_Ld5(%rip),%xmm9
movsd _$EIGH2$_Ld6(%rip),%xmm10
The `fppdf` unit receives a massive saving - before:
.section .text.n_fppdf$_$tpdfutf8text_$_write$tstream_$$_fin$000003b4,"ax"
...
call MATH_$$_TAN$DOUBLE$$DOUBLE
cvtsd2ss %xmm0,%xmm0
movss %xmm0,-112(%rbp)
movss %xmm8,-116(%rbp)
movss -104(%rbp),%xmm1
mulss -88(%rbp),%xmm1
movss -108(%rbp),%xmm0
mulss -96(%rbp),%xmm0
addss %xmm1,%xmm0
movss %xmm0,-88(%rbp)
movss -104(%rbp),%xmm1
mulss -92(%rbp),%xmm1
movss -108(%rbp),%xmm0
mulss -100(%rbp),%xmm0
addss %xmm1,%xmm0
movss %xmm0,-92(%rbp)
movss -112(%rbp),%xmm1
mulss -88(%rbp),%xmm1
movss -116(%rbp),%xmm0
mulss -96(%rbp),%xmm0
addss %xmm1,%xmm0
movss %xmm0,-96(%rbp)
movss -112(%rbp),%xmm1
mulss -92(%rbp),%xmm1
movss -116(%rbp),%xmm0
mulss -100(%rbp),%xmm0
addss %xmm1,%xmm0
movss %xmm0,-100(%rbp)
.Lj2092:
...
After - it seems to be able to successfully reuse values already stored on the stack:
.section .text.n_fppdf$_$tpdfutf8text_$_write$tstream_$$_fin$000003b4,"ax"
...
call MATH_$$_TAN$DOUBLE$$DOUBLE
cvtsd2ss %xmm0,%xmm0
movss %xmm0,-112(%rbp)
movss %xmm8,-116(%rbp)
movl -92(%rbp),%eax
movss -112(%rbp),%xmm0
mulss -88(%rbp),%xmm0
addss -96(%rbp),%xmm0
movss %xmm0,-96(%rbp)
movss -112(%rbp),%xmm0
mulss -92(%rbp),%xmm0
addss -100(%rbp),%xmm0
movss %xmm0,-100(%rbp)
.Lj2092:
...
A similar saving happens later in the file:
.section .text.n_fppdf$_$tpdfutf16text_$__$$_write$tstream,"ax"
...
call MATH_$$_TAN$DOUBLE$$DOUBLE
cvtsd2ss %xmm0,%xmm0
movss %xmm0,-144(%rbp)
movss %xmm8,-148(%rbp)
movss -136(%rbp),%xmm1
mulss -120(%rbp),%xmm1
movss -140(%rbp),%xmm0
mulss -128(%rbp),%xmm0
addss %xmm1,%xmm0
movss %xmm0,-120(%rbp)
movss -136(%rbp),%xmm1
mulss -124(%rbp),%xmm1
movss -140(%rbp),%xmm0
mulss -132(%rbp),%xmm0
addss %xmm1,%xmm0
movss %xmm0,-124(%rbp)
movss -144(%rbp),%xmm1
mulss -120(%rbp),%xmm1
movss -148(%rbp),%xmm0
mulss -128(%rbp),%xmm0
addss %xmm1,%xmm0
movss %xmm0,-128(%rbp)
movss -144(%rbp),%xmm1
mulss -124(%rbp),%xmm1
movss -148(%rbp),%xmm0
mulss -132(%rbp),%xmm0
addss %xmm1,%xmm0
movss %xmm0,-132(%rbp)
.Lj2172:
...
After:
.section .text.n_fppdf$_$tpdfutf16text_$__$$_write$tstream,"ax"
...
call MATH_$$_TAN$DOUBLE$$DOUBLE
cvtsd2ss %xmm0,%xmm0
movss %xmm0,-144(%rbp)
movss %xmm8,-148(%rbp)
movl -124(%rbp),%eax
movss -144(%rbp),%xmm0
mulss -120(%rbp),%xmm0
addss -128(%rbp),%xmm0
movss %xmm0,-128(%rbp)
movss -144(%rbp),%xmm0
mulss -124(%rbp),%xmm0
addss -132(%rbp),%xmm0
movss %xmm0,-132(%rbp)
.Lj2172:
...
Unfortunately it's not perfect - in the `fpcolorspace` unit - before:
.section .text.n_fpcolorspace$_$txyzahelper_$_fromspectrumrangereflect$single$single$single$single_$$_includewavelength$hys1cvj_eclh,"ax"
...
movss _$FPCOLORSPACE$_Ld1(%rip),%xmm10
...
jp .Lj768
jna .Lj768
...
mulss _$FPCOLORSPACE$_Ld110(%rip),%xmm0
subss %xmm0,%xmm10
After:
.section .text.n_fpcolorspace$_$txyzahelper_$_fromspectrumrangereflect$single$single$single$single_$$_includewavelength$hys1cvj_eclh,"ax"
...
movss _$FPCOLORSPACE$_Ld1(%rip),%xmm10
...
jp .Lj768
jna .Lj768
...
mulss _$FPCOLORSPACE$_Ld110(%rip),%xmm0
movss _$FPCOLORSPACE$_Ld1(%rip),%xmm1
subss %xmm0,%xmm1
movaps %xmm1,%xmm10
In this case, it's because constant propagation conflicts with the peephole optimizer. With the peephole optimizer turned off, this becomes apparent - before:
...
movaps %xmm10,%xmm1
subss %xmm0,%xmm1
movaps %xmm1,%xmm10
...
After:
...
movss _$FPCOLORSPACE$_Ld1(%rip),%xmm1
subss %xmm0,%xmm1
movaps %xmm1,%xmm10
...
In the former case, a peephole optimisation compresses the three instructions because `%xmm10` is both the input and the output, which cannot be assumed in the latter case. However, if an additional peephole optimisation is able to detect that `%xmm10` is equal to `_$FPCOLORSPACE$_Ld1(%rip)` and determine that replacing it won't cause a pipeline stall, then the original peephole optimisation can take place.
Additional Notes
This feature will be necessary for pure functions (!645) to work properly when dealing with floating-point values, since the pure-function implementation uses DFA and constant propagation to analyse the node tree for a given input.