1. In the FPC version you declare the function as
function Mult(const V1, V2: TVector3): TVector3;
You will get better results if you pass the parameters with constref instead:
function Mult(constref V1, V2: TVector3): TVector3;
2. The reason for the bad results of the 64-bit Pascal implementation is the way parameters are passed and the result is returned.
3. I did some testing myself.
OS: Linux 64-bit
{$mode objfpc}
{$asmmode intel}
{$assertions on}
uses
SysUtils;
type
  // Three-component single-precision vector used by every routine in this benchmark.
  TVector3 = record
    X: Single;
    Y: Single;
    Z: Single;
  end;
// Component-wise product of two vectors, returned as a function result.
//   V1, V2 - operands, passed by reference (constref).
//   Result - (V1.X*V2.X, V1.Y*V2.Y, V1.Z*V2.Z).
function Mult(constref V1, V2: TVector3): TVector3;
begin
  with Result do
  begin
    X := V1.X * V2.X;
    Y := V1.Y * V2.Y;
    Z := V1.Z * V2.Z;
  end;
end;
// Component-wise product of two vectors, written into a caller-supplied record.
//   V1, V2 - operands, passed by reference (constref).
//   Result - var parameter that receives the three products.
// Each component depends only on the matching components of V1 and V2,
// so the three assignments are independent of one another.
procedure MultPr(constref V1, V2: TVector3; var Result: TVector3);
begin
  Result.Z := V1.Z * V2.Z;
  Result.Y := V1.Y * V2.Y;
  Result.X := V1.X * V2.X;
end;
// Component-wise product of two vectors in hand-written SSE, returned as a
// function result.
//   V1, V2 - operands, passed by reference (constref).
// The products are left in registers: the first two components in the low half
// of xmm0 and the remaining component(s) in the low part of xmm1.
// NOTE(review): this depends on the target ABI returning a small all-float record
// in xmm0/xmm1; the benchmark was run on 64-bit Linux - confirm before using on
// any other target.
function Mult_SSE(constref V1, V2: TVector3): TVector3; assembler; nostackframe;
asm
{$if sizeof(TVector3)=12}
  // 12-byte record: load exactly 8 + 4 bytes per operand. A 16-byte movdqu here
  // would read 4 bytes past the end of the record.
  movq  xmm0, [v1]
  movq  xmm1, [v2]
  mulps xmm0, xmm1        // low half of xmm0 = X and Y products
  movss xmm1, [v1+8]
  movss xmm2, [v2+8]
  mulss xmm1, xmm2        // low lane of xmm1 = Z product
{$else}
{$if sizeof(TVector3)=16}
  // 16-byte record: one full-width load per operand is in bounds.
  movdqu xmm0, [v1]
  movdqu xmm1, [v2]
  mulps  xmm0, xmm1
  // Copy the upper two products down into the low half of xmm1.
  movdqu xmm1, xmm0
  psrldq xmm1, 8
{$else}
{$fatal sizeof(TVector3) has to be 12 or 16 bytes }
{$endif}
{$endif}
end;
// Component-wise product of two vectors in hand-written SSE, stored into a
// caller-supplied record.
//   V1, V2 - operands, passed by reference (constref).
//   Result - var parameter that receives the products.
// The instruction sequence is selected at compile time from sizeof(TVector3);
// any size other than 12 or 16 bytes aborts compilation.
procedure Mult_SSEPr(constref V1, V2: TVector3; var Result: TVector3); assembler; nostackframe;
asm
{$if sizeof(TVector3)=12}
// 12-byte record: handle the first 8 bytes (two singles) with a packed multiply...
movq xmm0, [v1]
movq xmm1, [v2]
mulps xmm0 , xmm1
movq [Result],xmm0
// ...and the third single at offset 8 with a scalar multiply, so no access
// goes beyond the 12 bytes of the record.
movss xmm0, [v1+8]
movss xmm1, [v2+8]
mulss xmm0 , xmm1
movss [Result+8],xmm0
{$else}
{$if sizeof(TVector3)=16}
// 16-byte record: a single unaligned 16-byte load/multiply/store covers all four singles.
movdqu xmm0, [v1]
movdqu xmm1, [v2]
mulps xmm0 , xmm1
movdqu [Result],xmm0
{$else}
{$fatal sizeof(TVector3) has to be 12 or 16 bytes }
{$endif}
{$endif}
end;
// Benchmark driver: times each of the four multiply variants over the same
// number of calls, prints the elapsed milliseconds, and checks the last
// result of each variant with assertions.
const
  // Every benchmark loop runs from 0 to LastIteration, i.e. 100,000,000 calls.
  LastIteration = 99999999;
var
  V, V1, V2: TVector3;
  I: Integer;
  // GetTickCount64 yields a 64-bit tick count; storing it in a 32-bit Integer
  // truncates it and corrupts the elapsed time once the count exceeds 32 bits.
  Tick: QWord;
begin
  writeln('Size of TVector3 ', sizeof(TVector3),' bytes');
  V1.X := 1; V1.Y := 2; V1.Z := 3;
  V2.X := 4; V2.Y := 5; V2.Z := 6;

  // Compiler-generated code, result returned as a function value.
  Write('fpc function: ');
  Tick := GetTickCount64;
  for I := 0 to LastIteration do
    V := Mult(V1, V2);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);

  // Compiler-generated code, result written through a var parameter.
  write ('fpc procedure: ');
  Tick := GetTickCount64;
  for I := 0 to LastIteration do
    MultPr(V1, V2, V);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);

  // Hand-written SSE, result returned as a function value.
  Write('Hand code function: ');
  Tick := GetTickCount64;
  for I := 0 to LastIteration do
    V := Mult_SSE(V1, V2);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);

  // Hand-written SSE, result written through a var parameter.
  Write('Hand code procedure: ');
  Tick := GetTickCount64;
  for I := 0 to LastIteration do
    Mult_SSEPr(V1, V2, v);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);
end.
output:
Size of TVector3 12 bytes
fpc function: 817ms
fpc procedure: 128ms
Hand code function: 185ms
Hand code procedure: 139ms
And for a TVector3 made of 4 Singles:
type
  // Four-component variant of the vector record (16 bytes instead of 12).
  TVector3 = record
    X: Single;
    Y: Single;
    Z: Single;
    A: Single;
  end;
output:
Size of TVector3 16 bytes
fpc function: 1164ms
fpc procedure: 161ms
Hand code function: 945ms
Hand code procedure: 116ms
In my tests, the function variants performed much worse when TVector3 had 4 Singles instead of 3.