-Sv generates scalar instructions.

On x86-64/win64, BatchMul_SSE_Sv does not work without -CfAVX(2). Disassembly (and observing output) shows that code that multiplies 4-vectors (__m128 * __m128) is compiled into scalar multiplication (mulss) instead of vector (mulps).

The intrinsic version works, but it is less desirable — not because the intrinsics are undocumented, but because -CfAVX generates VEX-encoded vmulps while the intrinsic remains mulps.

{$mode objfpc} {$longstrings on} {$typedaddress on}
type
	// Pointer to a 128-bit vector value; used below to reinterpret a pSingle
	// as a pointer to groups of four consecutive singles.
	pm128 = ^__m128;

	// Scalar reference implementation: r[i] := a[i] * m for every i in [0, n).
	//   a - first source element
	//   m - multiplier applied to every element
	//   n - number of elements to process (0 is allowed and does nothing)
	//   r - first destination element
	procedure BatchMul_Plain(a: pSingle; const m: single; n: SizeUint; r: pSingle);
	var
		i: SizeUint;
	begin
		// Elements are processed strictly in ascending order, one at a time.
		i := 0;
		while i < n do
		begin
			r[i] := a[i] * m;
			i += 1;
		end;
	end;

	// Vectorised r[i] := a[i] * m written with the `*` operator on __m128 values
	// (the "-Sv" vector syntax). This is the routine the accompanying report is
	// about: per the report, without -CfAVX(2) the multiplication below is emitted
	// as a scalar mulss instead of mulps on x86-64/win64.
	//   a - first source element
	//   m - multiplier applied to every element
	//   n - number of elements to process
	//   r - first destination element
	procedure BatchMul_SSE_Sv(a: pSingle; const m: single; n: SizeUint; r: pSingle);
	var
		aEnd4: pSingle;
		tm, tmStore: __m128;
	begin
		if n >= 4 then
		begin
			// End of the part that can be processed in whole groups of four.
			// Because n >= 4 here, aEnd4 is at least a + 4, so the repeat loop
			// below (which tests its condition after the first pass) terminates.
			aEnd4 := a + (n - n mod 4);
			// Broadcast m into all four lanes. The intended instruction sequence would be:
			// tm := movss(m); tm := shufps(tm, tm, %00000000);
			// The copy is deliberately spelled (@tmStore)^ rather than tmStore: the report
			// notes that the plain form gets miscompiled on x86-32/win32 with -O2 or greater.
			tmStore[0] := m; tmStore[1] := m; tmStore[2] := m; tmStore[3] := m; tm := (@tmStore)^;
			repeat
				// Four-lane multiply through the operator syntax under test.
				pm128(r)^ := pm128(a)^ * tm;
				a += 4; r += 4;
			until a = aEnd4;
			// Remaining 1..3 elements (if any) go through the scalar routine.
			if n mod 4 <> 0 then BatchMul_Plain(a, m, n mod 4, r);
		end else
			// Fewer than four elements: nothing to vectorise.
			BatchMul_Plain(a, m, n, r);
	end;

	// Binds the name mulps to the compiler intrinsic fpc_in_x86_mulps
	// (the accompanying report describes these intrinsics as undocumented).
	function mulps(r0, r1: __m128): __m128; internproc: fpc_in_x86_mulps;

	// Same algorithm as BatchMul_SSE_Sv, but the four-lane multiplication is done
	// through the mulps intrinsic instead of the `*` operator. Per the report this
	// variant produces correct results, though it stays mulps even under -CfAVX.
	//   a - first source element
	//   m - multiplier applied to every element
	//   n - number of elements to process
	//   r - first destination element
	procedure BatchMul_SSE_intrin(a: pSingle; const m: single; n: SizeUint; r: pSingle);
	var
		aEnd4: pSingle;
		tm, tmStore: __m128;
	begin
		if n >= 4 then
		begin
			// End of the part that can be processed in whole groups of four.
			// Because n >= 4 here, aEnd4 is at least a + 4, so the repeat loop
			// below (which tests its condition after the first pass) terminates.
			aEnd4 := a + (n - n mod 4);
			// Broadcast m into all four lanes. The intended instruction sequence would be:
			// tm := movss(m); tm := shufps(tm, tm, %00000000);
			// The copy is deliberately spelled (@tmStore)^ rather than tmStore: the report
			// notes that the plain form makes this version fail on x86-32/win32 with -O2 or greater.
			tmStore[0] := m; tmStore[1] := m; tmStore[2] := m; tmStore[3] := m; tm := (@tmStore)^;
			repeat
				// Four-lane multiply through the intrinsic.
				pm128(r)^ := mulps(pm128(a)^, tm);
				a += 4; r += 4;
			until a = aEnd4;
			// Remaining 1..3 elements (if any) go through the scalar routine.
			if n mod 4 <> 0 then BatchMul_Plain(a, m, n mod 4, r);
		end else
			// Fewer than four elements: nothing to vectorise.
			BatchMul_Plain(a, m, n, r);
	end;

type
	// Signature shared by BatchMul_Plain, BatchMul_SSE_Sv and BatchMul_SSE_intrin,
	// so that Test can be handed any of them.
	BatchMulProc = procedure(a: pSingle; const m: single; n: SizeUint; r: pSingle);

	// Renders n singles starting at a as '(v0, v1, ...)'.
	// Each value is formatted by WriteStr with its default formatting;
	// n = 0 yields '()'.
	function ToString(a: pSingle; n: SizeUint): string;
	var
		idx: SizeUint;
		item: string;
	begin
		result := '(';
		// Guard is required: n is unsigned, so n - 1 would wrap around for n = 0.
		if n > 0 then
			for idx := 0 to n - 1 do
			begin
				WriteStr(item, a[idx]);
				if idx <> 0 then
					result := result + ', ';
				result := result + item;
			end;
		result := result + ')';
	end;

	// Runs batchMul on a fixed four-element input and prints the verdict.
	//   batchMul - implementation under test
	//   name     - label printed before the verdict
	// Output is '<name> - ok' when the result matches the expected values
	// byte for byte, otherwise a multi-line ' - FAIL: ' report showing the
	// input, the values produced and the values expected.
	procedure Test(batchMul: BatchMulProc; const name: string);
	const
		Input: array[0 .. 3] of single = (-1, 2, 3, 4);
		Factor = 3;
		Expected: array[0 .. High(Input)] of single = (-3, 6, 9, 12);
	var
		got: array[0 .. High(Input)] of single;
		report: string;
	begin
		// The label is written before the call so it is visible even if the call misbehaves.
		write(name);
		FillChar((@got)^, sizeof(got), 0);
		batchMul(Input, Factor, length(Input), got);

		if CompareByte(got, Expected, sizeof(got)) = 0 then
		begin
			writeln(' - ok');
			exit;
		end;

		report := ' - FAIL: ' + LineEnding;
		report := report + 'A =      ' + ToString(Input, length(Input)) + ',' + LineEnding;
		report := report + 'got      ' + ToString(got, length(got)) + ',' + LineEnding;
		report := report + 'expected ' + ToString(Expected, length(Expected)) + '.' + LineEnding;
		writeln(report);
	end;

// Program entry point: runs the same fixed test against each implementation,
// scalar reference first, then the two SSE variants being compared in the report.
begin
	Test(@BatchMul_Plain, 'Plain');
	// Operator-syntax variant; this is the one reported to fail without -CfAVX(2).
	Test(@BatchMul_SSE_Sv, 'SSE using syntax');
	// Intrinsic-based variant; reported to work.
	Test(@BatchMul_SSE_intrin, 'SSE using intrinsics');
end.

Moreover, replacing tm := (@tmStore)^; with tm := tmStore makes even the intrinsic version fail on x86-32/win32 with -O2 or greater, but this is probably a separate issue. Disassembly (and observing the output) shows that the tm := tmStore operation is discarded, but the main loop behaves as if it was not.

Edited Mar 06, 2022 by Rika

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information