Improving mul and div for powers of 2

Summary

As of now, FPC optimizes unsigned values div and mul when the right side is a power of 2, eg I div 2 will be changed to I shr 1. But it does not do the optimization for signed values as they may be negative. But as in almost all cases of loops in Pascal, we use signed integers, it can be very helpful, as FPC can analyze the data flow and consider it when the value is always positive, so it can be changed to shl or shr and gain almost 2x speedup.

Example Project

Result in: Lazarus 2.3.0 (rev main-2_3-1428-g9f142eef14) FPC 3.3.1 x86_64-win64-win32/win64 without optimization

program project1;
var
  I, V: Cardinal;
begin
  I := 0;
  V := I shr 1;
  V := I div 2;
end.           
# [7] V := I shr 1;
	shrl	$1,%eax
	movl	%eax,U_$P$PROJECT1_$$_V(%rip)
.Ll4:
# [8] V := I div 2;
	movl	U_$P$PROJECT1_$$_I(%rip),%eax
	shrl	$1,%eax
	movl	%eax,U_$P$PROJECT1_$$_V(%rip)

Same instructions.

Change the type to Integer:

program project1;

var
  I, V: Integer;
begin
  I := 0;
  V := I shr 1;
  V := I div 2;
end.          
# [7] V := I shr 1;
	shrl	$1,%eax
	movl	%eax,U_$P$PROJECT1_$$_V(%rip)
.Ll4:
# [8] V := I div 2;
	movslq	U_$P$PROJECT1_$$_I(%rip),%rax
	movq	%rax,%rdx
	shrq	$63,%rdx
	addq	%rdx,%rax
	sarq	$1,%rax
	movl	%eax,U_$P$PROJECT1_$$_V(%rip)

I understand this, compiler thinks that as the value is integer, it may be negative so it does not use the same simple shr.

Now for this code and even with -O4 the results will be:

program Project1;

uses
  SysUtils;

  procedure TestDiv;
  var
    I, V: Integer;
  begin
    for I := 1 to 1000 * 1000 * 1000 do
      V := I div 2;
    WriteLn(V);
  end;

  procedure TestSHR;
  var
    I, V: Integer;
  begin
    for I := 1 to 1000 * 1000 * 1000 do
      V := I shr 1;
    WriteLn(V);
  end;

var
  T: QWord;
begin
  T := GetTickCount64;
  TestDiv;
  WriteLn('div: ', GetTickCount64 - T);

  T := GetTickCount64;
  TestSHR;
  WriteLn('shr: ', GetTickCount64 - T);
  ReadLn;
end.
# [11] V := I div 2;
	movslq	%eax,%rdx
	movq	%rdx,%rcx
	shrq	$63,%rcx
	addq	%rcx,%rdx
	sarq	$1,%rdx
	movl	%edx,%ebx
	addl	$1,%eax
	cmpl	$1000000000,%eax
	jng	.Lj5
# [20] V := I shr 1;
	movl	%eax,%ebx
	shrl	$1,%ebx
	addl	$1,%eax
	cmpl	$1000000000,%eax
	jng	.Lj10

and the times are: div: 469 shr: 218

and if I change integer to cardinal:

# [11] V := I div 2;
	movl	%eax,%ebx
	shrl	$1,%ebx
	addl	$1,%eax
	cmpl	$1000000000,%eax
	jna	.Lj5
# [20] V := I shr 1;
	movl	%eax,%ebx
	shrl	$1,%ebx
	addl	$1,%eax
	cmpl	$1000000000,%eax
	jna	.Lj10

And the times: div: 219 shr: 219

Relevant 3rd party information

https://forum.lazarus.freepascal.org/index.php?topic=58705.new;topicseen#new