x86: Expanded XMM block move to include YMM registers under AVX
This merge request expands the XMM block move feature to use YMM registers if they're available.
Criteria
Compile under -Cfavx -OpCOREAVX -CpCOREAVX and confirm correct compilation
Notes
Currently, the RTL can't take advantage of this extension, and there is only one example in the compiler itself, in TRealConstNode.DoGetCopy, so it might provide only a token speed boost. Before:
# "Before" listing: compiler output for this routine prior to the change
# (GAS, AT&T operand order: source first, destination last).
# The .seh_* directives indicate a Windows x64 (SEH unwind) target.
#   In:    %rcx = first argument (presumably Self -- confirm against the Pascal source)
#   Out:   %rax = value returned by the inner NODE$_$TNODE...DOGETCOPY call;
#          it is not modified afterwards, so it is also this routine's result
#   Saves: %rbx (pushed in the prologue, popped in the epilogue)
#   Uses:  %xmm0 as scratch for the block copy
.section .text.n_ncon$_$trealconstnode_$__$$_dogetcopy$$tnode,"ax"
.balign 16,0x90
.globl NCON$_$TREALCONSTNODE_$__$$_DOGETCOPY$$TNODE
NCON$_$TREALCONSTNODE_$__$$_DOGETCOPY$$TNODE:
.seh_proc NCON$_$TREALCONSTNODE_$__$$_DOGETCOPY$$TNODE
# %rbx is callee-saved under the Microsoft x64 convention, so it is preserved
# before being reused to hold the incoming argument across the call.
pushq %rbx
.seh_pushreg %rbx
# Reserve 32 bytes of stack (the size of the Win64 shadow space a caller must
# provide for the call below).
leaq -32(%rsp),%rsp
.seh_stackalloc 32
.seh_endprologue
# Keep the incoming argument in %rbx so it is still available after the call.
movq %rcx,%rbx
# Peephole Optimization: %rbx = %rcx; changed to minimise pipeline stall (MovXXX2MovXXX)
# Peephole Optimization: Mov2Nop 4 done
call NODE$_$TNODE_$__$$_DOGETCOPY$$TNODE
# Block copy, first half: 16 bytes from offset 144 of the source object (%rbx)
# to offset 144 of the object returned in %rax.
# Peephole Optimization: Used %xmm0 to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 1)
vmovdqu 144(%rbx),%xmm0
vmovdqu %xmm0,144(%rax)
# Block copy, second half: the adjacent 16 bytes at offset 160, so bytes
# 144..175 are copied in total using two load/store pairs.
# Peephole Optimization: Used %xmm0 to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 1)
vmovdqu 160(%rbx),%xmm0
vmovdqu %xmm0,160(%rax)
nop
# Epilogue: release the 32 bytes reserved above and restore %rbx.
leaq 32(%rsp),%rsp
popq %rbx
ret
.seh_endproc
After:
# "After" listing: compiler output for the same routine with the YMM block-move
# extension enabled (GAS, AT&T operand order: source first, destination last).
# The .seh_* directives indicate a Windows x64 (SEH unwind) target.
#   In:    %rcx = first argument (presumably Self -- confirm against the Pascal source)
#   Out:   %rax = value returned by the inner NODE$_$TNODE...DOGETCOPY call;
#          it is not modified afterwards, so it is also this routine's result
#   Saves: %rbx (pushed in the prologue, popped in the epilogue)
#   Uses:  %ymm0 as scratch for the block copy
.section .text.n_ncon$_$trealconstnode_$__$$_dogetcopy$$tnode,"ax"
.balign 16,0x90
.globl NCON$_$TREALCONSTNODE_$__$$_DOGETCOPY$$TNODE
NCON$_$TREALCONSTNODE_$__$$_DOGETCOPY$$TNODE:
.seh_proc NCON$_$TREALCONSTNODE_$__$$_DOGETCOPY$$TNODE
# %rbx is callee-saved under the Microsoft x64 convention, so it is preserved
# before being reused to hold the incoming argument across the call.
pushq %rbx
.seh_pushreg %rbx
# Reserve 32 bytes of stack (the size of the Win64 shadow space a caller must
# provide for the call below).
leaq -32(%rsp),%rsp
.seh_stackalloc 32
.seh_endprologue
# Keep the incoming argument in %rbx so it is still available after the call.
movq %rcx,%rbx
# Peephole Optimization: %rbx = %rcx; changed to minimise pipeline stall (MovXXX2MovXXX)
# Peephole Optimization: Mov2Nop 4 done
call NODE$_$TNODE_$__$$_DOGETCOPY$$TNODE
# Block copy: a single 32-byte load/store pair through %ymm0 covers offsets
# 144..175, replacing two 16-byte %xmm0 pairs at offsets 144 and 160.
# NOTE(review): the two "Used %xmm0" messages below appear to be left over from
# the earlier XMM merge whose instructions were then folded into the YMM pair --
# confirm whether the peephole pass should drop them.
# Peephole Optimization: Used %xmm0 to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 1)
# Peephole Optimization: Used %ymm0 to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 1)
vmovdqu 144(%rbx),%ymm0
vmovdqu %ymm0,144(%rax)
# Peephole Optimization: Used %xmm0 to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 1)
# NOTE(review): %ymm0 is written above and no vzeroupper is emitted before the
# return -- confirm whether one is needed when callers may run legacy SSE code.
nop
# Epilogue: release the 32 bytes reserved above and restore %rbx.
leaq 32(%rsp),%rsp
popq %rbx
ret
.seh_endproc