Skip to content

[ARM / AArch64] Merging small constants written to the stack into larger ones

Summary

This merge request is an AArch64 counterpart to !97 (merged).

This merge request looks for adjacent, sequential writes of constants to the stack (addressed relative to either the stack pointer or the procedure's frame pointer, if that is a different register) and attempts to merge them to reduce instruction count and memory latency. Specifically, it will merge 2 adjacent bytes into a half-word, 2 adjacent half-words into a word, and, for AArch64 only, 2 adjacent words into an extended word.

Note that these merges are only performed if the first address is aligned to the new size.

The code also allows the optimisation to be performed under ARMV7A on the ARM platform, as this revision introduces the MOVW and MOVT instructions that allow better encoding of 32-bit constants.

System

  • Operating system: Linux (Raspberry Pi OS) and others
  • Processor architecture: ARM, AArch64
  • Device: Raspberry Pi and others

What is the current bug behavior?

N/A

What is the behavior after applying this patch?

Sequential writes of small constants to the stack are merged into writes of larger constants.

Relevant logs and/or screenshots

Simple example in AArch64's aasmcnst under -O4 (which shows how the compiler handles the zero register in these merges) - before:

.Lj667:
	strb	wzr,[sp, #16]
	movz	w0,#1
	strb	w0,[sp, #17]
        ...

After:

.Lj667:
	movz	w0,#256
	strh	w0,[sp, #16]
        ...

A big example in cgcpu - though the instruction counts are similar, MOVZ/MOVK and MOVK/MOVK pairs undergo macro-fusion and can be executed in a single clock cycle, so there is a large speed increase - before:

.section .text.n_cgcpu$_$tcgaarch64_$__$$_init_register_allocators,"ax"
	.balign 8
.globl	CGCPU$_$TCGAARCH64_$__$$_INIT_REGISTER_ALLOCATORS
	.type	CGCPU$_$TCGAARCH64_$__$$_INIT_REGISTER_ALLOCATORS,@function
CGCPU$_$TCGAARCH64_$__$$_INIT_REGISTER_ALLOCATORS:
	stp	x29,x30,[sp, #-16]!
	mov	x29,sp
	stp	x19,x19,[sp, #-16]!
	sub	sp,sp,#128
	mov	x19,x0
	bl	CGOBJ$_$TCG_$__$$_INIT_REGISTER_ALLOCATORS
	strh	wzr,[sp]
	movz	w0,#1
	strh	w0,[sp, #2]
	movz	w0,#2
	strh	w0,[sp, #4]
	movz	w0,#3
	strh	w0,[sp, #6]
	movz	w0,#4
	strh	w0,[sp, #8]
	movz	w0,#5
	strh	w0,[sp, #10]
	movz	w0,#6
	strh	w0,[sp, #12]
	movz	w0,#7
	strh	w0,[sp, #14]
	movz	w0,#8
	strh	w0,[sp, #16]
	movz	w0,#9
	strh	w0,[sp, #18]
	movz	w0,#10
	strh	w0,[sp, #20]
	movz	w0,#11
	strh	w0,[sp, #22]
	movz	w0,#12
	strh	w0,[sp, #24]
	movz	w0,#13
	strh	w0,[sp, #26]
	movz	w0,#14
	strh	w0,[sp, #28]
	movz	w0,#15
	strh	w0,[sp, #30]
	movz	w0,#16
	strh	w0,[sp, #32]
	movz	w0,#17
	strh	w0,[sp, #34]
	movz	w0,#19
	strh	w0,[sp, #36]
	movz	w0,#20
	strh	w0,[sp, #38]
	movz	w0,#21
	strh	w0,[sp, #40]
	movz	w0,#22
	strh	w0,[sp, #42]
	movz	w0,#23
	strh	w0,[sp, #44]
	movz	w0,#24
	strh	w0,[sp, #46]
	movz	w0,#25
	strh	w0,[sp, #48]
	movz	w0,#26
	strh	w0,[sp, #50]
	movz	w0,#27
	strh	w0,[sp, #52]
	movz	w0,#28
	strh	w0,[sp, #54]
	...

After:

.section .text.n_cgcpu$_$tcgaarch64_$__$$_init_register_allocators,"ax"
	.balign 8
.globl	CGCPU$_$TCGAARCH64_$__$$_INIT_REGISTER_ALLOCATORS
	.type	CGCPU$_$TCGAARCH64_$__$$_INIT_REGISTER_ALLOCATORS,@function
CGCPU$_$TCGAARCH64_$__$$_INIT_REGISTER_ALLOCATORS:
	stp	x29,x30,[sp, #-16]!
	mov	x29,sp
	stp	x19,x19,[sp, #-16]!
	sub	sp,sp,#128
	mov	x19,x0
	bl	CGOBJ$_$TCG_$__$$_INIT_REGISTER_ALLOCATORS
	movz	x0,#0
	movk	x0,#1,lsl #16
	movk	x0,#2,lsl #32
	movk	x0,#3,lsl #48
	str	x0,[sp]
	movz	x0,#4
	movk	x0,#5,lsl #16
	movk	x0,#6,lsl #32
	movk	x0,#7,lsl #48
	str	x0,[sp, #8]
	movz	x0,#8
	movk	x0,#9,lsl #16
	movk	x0,#10,lsl #32
	movk	x0,#11,lsl #48
	str	x0,[sp, #16]
	movz	x0,#12
	movk	x0,#13,lsl #16
	movk	x0,#14,lsl #32
	movk	x0,#15,lsl #48
	str	x0,[sp, #24]
	movz	x0,#16
	movk	x0,#17,lsl #16
	movk	x0,#19,lsl #32
	movk	x0,#20,lsl #48
	str	x0,[sp, #32]
	movz	x0,#21
	movk	x0,#22,lsl #16
	movk	x0,#23,lsl #32
	movk	x0,#24,lsl #48
	str	x0,[sp, #40]
	movz	x0,#25
	movk	x0,#26,lsl #16
	movk	x0,#27,lsl #32
	movk	x0,#28,lsl #48
	str	x0,[sp, #48]
	...

Example in defutil that merges two zero-register writes (and, above them, a zero-register write with a non-zero constant write) - before:

.Lj77:
	strb	wzr,[sp]
	movz	w0,#1
	strb	w0,[sp, #1]
	str	xzr,[sp, #8]
	ldp	x2,x3,[sp]
	stp	x19,x20,[sp, #16]
	ldp	x0,x1,[sp, #16]
	bl	CONSTEXP_$$_$greater_or_equal$TCONSTEXPRINT$TCONSTEXPRINT$$BOOLEAN
	uxtb	w0,w0
	cbz	w0,.Lj83
	strb	wzr,[sp]
	strb	wzr,[sp, #1]
        ...

After:

.Lj77:
	movz	w0,#256
	strh	w0,[sp]
	str	xzr,[sp, #8]
	ldp	x2,x3,[sp]
	stp	x19,x20,[sp, #16]
	ldp	x0,x1,[sp, #16]
	bl	CONSTEXP_$$_$greater_or_equal$TCONSTEXPRINT$TCONSTEXPRINT$$BOOLEAN
	uxtb	w0,w0
	cbz	w0,.Lj83
	strh	wzr,[sp]
	...

In Sysutils, the optimisation ends up replacing a slower STP and STR pair with two STR instructions - before:

.Lj6360:
	stp	wzr,wzr,[sp, #36]
	str	wzr,[sp, #44]
        ...

After (will run a cycle faster):

.Lj6360:
	str	wzr,[sp, #36]
	str	xzr,[sp, #40]
        ...
Edited by J. Gareth "Kit" Moreton

Merge request reports