Skip to content

[Cross-platform] Constant propagation now evaluates real-valued constants

Summary

This merge request adds a missing feature (a single line of code) to constant propagation, allowing it to propagate floating-point constants.

System

  • Processor architecture: Cross-platform

What is the current bug behavior?

Constant propagation does not propagate floating-point constants.

What is the behavior after applying this patch?

Floating-point constants are now propagated.

Relevant logs and/or screenshots

To start with a very simple example - in the spe unit (x86_64-win64, -O4 -OoCONSTPROP -OoDFA) - before:

.section .text.n_spe_$$_speerf$double$$double,"ax"
	...
	xorpd	%xmm1,%xmm1
	movl	$16,%eax
	movapd	%xmm8,%xmm9
	mulsd	%xmm2,%xmm9
	subsd	%xmm1,%xmm9
	addsd	TC_$SPE$_$SPEERF$DOUBLE$$DOUBLE_$$_D+120(%rip),%xmm9
	...
.Lj171:

After - thanks to constant propagation, the compiler now knows that %xmm1 contains zero and so the subtraction is a null operation:

.section .text.n_spe_$$_speerf$double$$double,"ax"
	...
	xorpd	%xmm1,%xmm1
	movl	$16,%eax
	movapd	%xmm8,%xmm9
	mulsd	%xmm2,%xmm9
	addsd	TC_$SPE$_$SPEERF$DOUBLE$$DOUBLE_$$_D+120(%rip),%xmm9
	...
.Lj171:
	...

In the eigh2 unit - before:

.section .text.n_eigh2_$$_balance$double$longint$longint$longint$longint$double,"ax"
	...
	movsd	_$EIGH2$_Ld5(%rip),%xmm8
	movapd	%xmm8,%xmm9
	mulsd	%xmm9,%xmm9

After - although the registers are allocated differently, the square of _$EIGH2$_Ld5(%rip) is now precalculated at compile time and stored directly as a constant, thus breaking the dependency chain:

.section .text.n_eigh2_$$_balance$double$longint$longint$longint$longint$double,"ax"
	...
	movsd	_$EIGH2$_Ld5(%rip),%xmm9
	movsd	_$EIGH2$_Ld6(%rip),%xmm10

The fppdf unit receives a massive saving - before:

.section .text.n_fppdf$_$tpdfutf8text_$_write$tstream_$$_fin$000003b4,"ax"
	...
	call	MATH_$$_TAN$DOUBLE$$DOUBLE
	cvtsd2ss	%xmm0,%xmm0
	movss	%xmm0,-112(%rbp)
	movss	%xmm8,-116(%rbp)
	movss	-104(%rbp),%xmm1
	mulss	-88(%rbp),%xmm1
	movss	-108(%rbp),%xmm0
	mulss	-96(%rbp),%xmm0
	addss	%xmm1,%xmm0
	movss	%xmm0,-88(%rbp)
	movss	-104(%rbp),%xmm1
	mulss	-92(%rbp),%xmm1
	movss	-108(%rbp),%xmm0
	mulss	-100(%rbp),%xmm0
	addss	%xmm1,%xmm0
	movss	%xmm0,-92(%rbp)
	movss	-112(%rbp),%xmm1
	mulss	-88(%rbp),%xmm1
	movss	-116(%rbp),%xmm0
	mulss	-96(%rbp),%xmm0
	addss	%xmm1,%xmm0
	movss	%xmm0,-96(%rbp)
	movss	-112(%rbp),%xmm1
	mulss	-92(%rbp),%xmm1
	movss	-116(%rbp),%xmm0
	mulss	-100(%rbp),%xmm0
	addss	%xmm1,%xmm0
	movss	%xmm0,-100(%rbp)
.Lj2092:
	...

After - it seems to be able to successfully reuse values already stored on the stack:

.section .text.n_fppdf$_$tpdfutf8text_$_write$tstream_$$_fin$000003b4,"ax"
	...
	call	MATH_$$_TAN$DOUBLE$$DOUBLE
	cvtsd2ss	%xmm0,%xmm0
	movss	%xmm0,-112(%rbp)
	movss	%xmm8,-116(%rbp)
	movl	-92(%rbp),%eax
	movss	-112(%rbp),%xmm0
	mulss	-88(%rbp),%xmm0
	addss	-96(%rbp),%xmm0
	movss	%xmm0,-96(%rbp)
	movss	-112(%rbp),%xmm0
	mulss	-92(%rbp),%xmm0
	addss	-100(%rbp),%xmm0
	movss	%xmm0,-100(%rbp)
.Lj2092:
	...

A similar saving happens later in the file:

.section .text.n_fppdf$_$tpdfutf16text_$__$$_write$tstream,"ax"
	...
	call	MATH_$$_TAN$DOUBLE$$DOUBLE
	cvtsd2ss	%xmm0,%xmm0
	movss	%xmm0,-144(%rbp)
	movss	%xmm8,-148(%rbp)
	movss	-136(%rbp),%xmm1
	mulss	-120(%rbp),%xmm1
	movss	-140(%rbp),%xmm0
	mulss	-128(%rbp),%xmm0
	addss	%xmm1,%xmm0
	movss	%xmm0,-120(%rbp)
	movss	-136(%rbp),%xmm1
	mulss	-124(%rbp),%xmm1
	movss	-140(%rbp),%xmm0
	mulss	-132(%rbp),%xmm0
	addss	%xmm1,%xmm0
	movss	%xmm0,-124(%rbp)
	movss	-144(%rbp),%xmm1
	mulss	-120(%rbp),%xmm1
	movss	-148(%rbp),%xmm0
	mulss	-128(%rbp),%xmm0
	addss	%xmm1,%xmm0
	movss	%xmm0,-128(%rbp)
	movss	-144(%rbp),%xmm1
	mulss	-124(%rbp),%xmm1
	movss	-148(%rbp),%xmm0
	mulss	-132(%rbp),%xmm0
	addss	%xmm1,%xmm0
	movss	%xmm0,-132(%rbp)
.Lj2172:
	...

After:

.section .text.n_fppdf$_$tpdfutf16text_$__$$_write$tstream,"ax"
	...
	call	MATH_$$_TAN$DOUBLE$$DOUBLE
	cvtsd2ss	%xmm0,%xmm0
	movss	%xmm0,-144(%rbp)
	movss	%xmm8,-148(%rbp)
	movl	-124(%rbp),%eax
	movss	-144(%rbp),%xmm0
	mulss	-120(%rbp),%xmm0
	addss	-128(%rbp),%xmm0
	movss	%xmm0,-128(%rbp)
	movss	-144(%rbp),%xmm0
	mulss	-124(%rbp),%xmm0
	addss	-132(%rbp),%xmm0
	movss	%xmm0,-132(%rbp)
.Lj2172:
	...

Unfortunately it's not perfect - in the fpcolorspace unit - before:

.section .text.n_fpcolorspace$_$txyzahelper_$_fromspectrumrangereflect$single$single$single$single_$$_includewavelength$hys1cvj_eclh,"ax"
	...
	movss	_$FPCOLORSPACE$_Ld1(%rip),%xmm10
	...
	jp	.Lj768
	jna	.Lj768
	...
	mulss	_$FPCOLORSPACE$_Ld110(%rip),%xmm0
	subss	%xmm0,%xmm10

After:

.section .text.n_fpcolorspace$_$txyzahelper_$_fromspectrumrangereflect$single$single$single$single_$$_includewavelength$hys1cvj_eclh,"ax"
	...
	movss	_$FPCOLORSPACE$_Ld1(%rip),%xmm10
	...
	jp	.Lj768
	jna	.Lj768
	...
	mulss	_$FPCOLORSPACE$_Ld110(%rip),%xmm0
	movss	_$FPCOLORSPACE$_Ld1(%rip),%xmm1
	subss	%xmm0,%xmm1
	movaps	%xmm1,%xmm10

In this case, it's because constant propagation conflicts with the peephole optimizer. With the peephole optimizer turned off, this becomes apparent - before:

	...
	movaps	%xmm10,%xmm1
	subss	%xmm0,%xmm1
	movaps	%xmm1,%xmm10
	...

After:

	...
	movss	_$FPCOLORSPACE$_Ld1(%rip),%xmm1
	subss	%xmm0,%xmm1
	movaps	%xmm1,%xmm10
	...

In the former case, a peephole optimisation compresses the three instructions because %xmm10 is both the input and the output, which cannot be assumed in the latter case. However, if an additional peephole optimisation is able to detect that %xmm10 is equal to _$FPCOLORSPACE$_Ld1(%rip) and determine that replacing it won't cause a pipeline stall, then the original peephole optimisation can take place.

Additional Notes

This feature will be necessary for pure functions (!645) to work properly when dealing with floating-point values, since the pure-function implementation uses DFA and constant propagation to analyse the node tree for a given input.

Edited by J. Gareth "Kit" Moreton

Merge request reports