-Sv generates scalar instructions.
On x86-64/win64, `BatchMul_SSE_Sv` does not work without `-CfAVX`/`-CfAVX2`. Disassembly (and observing the output) shows that the code multiplying 4-vectors (`__m128 * __m128`) is compiled into a scalar multiplication (`mulss`) instead of a vector one (`mulps`).
The intrinsic version works, but is less desirable: not because the intrinsics are undocumented, but because `-CfAVX` generates VEX-encoded `vmulps` while the intrinsic remains `mulps`.
{$mode objfpc} {$longstrings on} {$typedaddress on}
type
// Pointer to an __m128 vector; the SSE variants below cast pSingle buffers to it
// so that four consecutive singles can be read or written as one vector.
pm128 = ^__m128;
// Scalar reference implementation: r[i] := a[i] * m for every i in [0, n).
//   a - source buffer holding at least n singles
//   m - multiplier applied to every element
//   n - number of elements to process (0 is allowed and does nothing)
//   r - destination buffer with room for at least n singles
procedure BatchMul_Plain(a: pSingle; const m: single; n: SizeUint; r: pSingle);
var
  i: SizeUint;
begin
  // Walk both buffers by index, lowest element first.
  i := 0;
  while i < n do
  begin
    r[i] := a[i] * m;
    Inc(i);
  end;
end;
// Vector-syntax variant of BatchMul_Plain (intended result: r[i] := a[i] * m).
// Processes four singles per iteration using the `*` operator on __m128 values;
// this is the procedure the accompanying report describes as being compiled to
// scalar mulss instead of mulps. Inputs with n < 4, and the n mod 4 leftover
// elements, are delegated to BatchMul_Plain.
// NOTE: kept statement-for-statement as written, since it reproduces a
// code-generation problem and small rewrites may hide it.
procedure BatchMul_SSE_Sv(a: pSingle; const m: single; n: SizeUint; r: pSingle);
var
aEnd4: pSingle;
tm, tmStore: __m128;
begin
if n >= 4 then
begin
// End of the leading part of a whose length is a multiple of 4.
aEnd4 := a + (n - n mod 4);
// tm := movss(m); tm := shufps(tm, tm, %00000000);
// Broadcast m into all four lanes. The copy deliberately goes through (@tmStore)^:
// per the report, a plain `tm := tmStore` breaks on x86-32/win32 with -O2 or greater.
tmStore[0] := m; tmStore[1] := m; tmStore[2] := m; tmStore[3] := m; tm := (@tmStore)^;
// n >= 4 guarantees at least one full group, so a post-tested loop is safe here.
repeat
// The statement under test: __m128 * __m128 on four singles at once.
pm128(r)^ := pm128(a)^ * tm;
a += 4; r += 4;
until a = aEnd4;
// Finish the 1..3 elements that do not fill a whole vector.
if n mod 4 <> 0 then BatchMul_Plain(a, m, n mod 4, r);
end else
BatchMul_Plain(a, m, n, r);
end;
// Declares mulps as a compiler intrinsic bound to fpc_in_x86_mulps (the report
// notes such intrinsics are undocumented). Used by BatchMul_SSE_intrin.
function mulps(r0, r1: __m128): __m128; internproc: fpc_in_x86_mulps;
// Intrinsic variant of BatchMul_Plain (intended result: r[i] := a[i] * m).
// Identical in structure to BatchMul_SSE_Sv, except that the four-wide multiply
// is done through the mulps intrinsic instead of the `*` operator on __m128.
// Inputs with n < 4, and the n mod 4 leftover elements, are delegated to
// BatchMul_Plain.
// NOTE: kept statement-for-statement as written, since it is part of a
// code-generation bug reproduction.
procedure BatchMul_SSE_intrin(a: pSingle; const m: single; n: SizeUint; r: pSingle);
var
aEnd4: pSingle;
tm, tmStore: __m128;
begin
if n >= 4 then
begin
// End of the leading part of a whose length is a multiple of 4.
aEnd4 := a + (n - n mod 4);
// tm := movss(m); tm := shufps(tm, tm, %00000000);
// Broadcast m into all four lanes. The copy deliberately goes through (@tmStore)^:
// per the report, a plain `tm := tmStore` makes this variant fail on
// x86-32/win32 with -O2 or greater.
tmStore[0] := m; tmStore[1] := m; tmStore[2] := m; tmStore[3] := m; tm := (@tmStore)^;
// n >= 4 guarantees at least one full group, so a post-tested loop is safe here.
repeat
// Four-wide multiply via the intrinsic.
pm128(r)^ := mulps(pm128(a)^, tm);
a += 4; r += 4;
until a = aEnd4;
// Finish the 1..3 elements that do not fill a whole vector.
if n mod 4 <> 0 then BatchMul_Plain(a, m, n mod 4, r);
end else
BatchMul_Plain(a, m, n, r);
end;
type
// Shared signature of the BatchMul_* procedures, so that Test can be handed any of them.
BatchMulProc = procedure(a: pSingle; const m: single; n: SizeUint; r: pSingle);
// Formats the first n singles of a as "(v0, v1, ...)".
// Each value is rendered with WriteStr's default formatting for a single;
// n = 0 yields "()".
function ToString(a: pSingle; n: SizeUint): string;
var
  pos: SizeUint;
  item: string;
begin
  result := '(';
  // 1-based counter so that n = 0 gives an empty range with an unsigned type.
  for pos := 1 to n do
  begin
    WriteStr(item, a[pos - 1]);
    if pos > 1 then
      result := result + ', ';
    result := result + item;
  end;
  result := result + ')';
end;
// Runs one BatchMul implementation on a fixed four-element input and prints
// "<name> - ok" when the result matches the expected vector byte for byte,
// or a failure report listing the input, the actual and the expected vectors.
//   batchMul - implementation under test
//   name     - label printed in front of the verdict
procedure Test(batchMul: BatchMulProc; const name: string);
const
  SrcA: array[0 .. 3] of single = (-1, 2, 3, 4);
  SrcM = 3;
  ExpectR: array[0 .. High(SrcA)] of single = (-3, 6, 9, 12);
var
  got: array[0 .. High(SrcA)] of single;
begin
  // The label is emitted before the call, ahead of anything batchMul may do.
  write(name);
  // Start from zeros so that elements the implementation skips are visible.
  FillChar((@got)^, sizeof(got), 0);
  batchMul(SrcA, SrcM, length(SrcA), got);
  if CompareByte(got, ExpectR, sizeof(got)) <> 0 then
  begin
    writeln(' - FAIL: ');
    writeln('A = ', ToString(SrcA, length(SrcA)), ',');
    writeln('got ', ToString(got, length(got)), ',');
    writeln('expected ', ToString(ExpectR, length(ExpectR)), '.');
    // Trailing blank line, matching the final LineEnding of the report.
    writeln;
  end
  else
    writeln(' - ok');
end;
const
  // Implementations to exercise, in the order they are reported.
  TestCases: array[0 .. 2] of record
    impl: BatchMulProc;
    title: string;
  end = (
    (impl: @BatchMul_Plain; title: 'Plain'),
    (impl: @BatchMul_SSE_Sv; title: 'SSE using syntax'),
    (impl: @BatchMul_SSE_intrin; title: 'SSE using intrinsics')
  );
var
  caseIdx: SizeInt;
begin
  // Program entry point: run every variant through the same check.
  for caseIdx := Low(TestCases) to High(TestCases) do
    Test(TestCases[caseIdx].impl, TestCases[caseIdx].title);
end.
Moreover, replacing `tm := (@tmStore)^;` with `tm := tmStore` makes even the intrinsic version fail on x86-32/win32 with `-O2` or greater, but this is probably a separate issue. Disassembly (and observing the output) shows that the `tm := tmStore` operation is discarded, but the main loop behaves as if it was not.
Edited by Rika