So I decided to run a small benchmark on floating-point operations and noticed that performance is worse when compiling for the x86_64 target.
Specs:
OS: Windows 10
CPU: Intel Core i5-9400F
FPC version: 3.2.0-r45643
command line (x64): fpc -Twin64 -Px86_64 -CfSSE42 -CpCOREI -O4 -Sv test.pas
command line (x86): fpc -CfSSE42 -CpCOREI -O4 -Sv test.pas
Code:
{$mode objfpc}
{$asmmode intel}
{$assertions on}
uses
SysUtils;
type
{ Three-component single-precision vector.
  Three Single fields, i.e. 12 bytes assuming no padding; Mult_SSE reads
  and writes the record through two overlapping 8-byte accesses and so
  depends on that layout. }
TVector3 = record
X, Y, Z: Single;
end;
{ Component-wise product of two vectors, written in plain Pascal.
  Each field of the result is the product of the matching fields of
  V1 and V2. This is the compiler-generated version that the benchmark
  labels 'fpc'. }
function Mult(const V1, V2: TVector3): TVector3;
begin
Result.X := V1.X * V2.X;
Result.Y := V1.Y * V2.Y;
Result.Z := V1.Z * V2.Z;
end;
{ Component-wise product of two vectors, hand-written with packed
  single-precision instructions (the version the benchmark labels
  'Hand code').
  V1 and V2 are passed by reference (constref). Each 12-byte record is
  loaded with two overlapping 8-byte reads at offsets 0 and 4, so no
  access goes past the end of the record. With the X, Y, Z field order
  the four lanes of each register hold (X, Y, Y, Z); the duplicated Y
  lane is multiplied twice and written twice, which is harmless. }
function Mult_SSE(constref V1, V2: TVector3): TVector3; assembler; nostackframe;
asm
// xmm0 high half <- bytes 4..11 of V1 (Y, Z)
movhps xmm0,qword ptr [V1 + 4]
// xmm0 low half  <- bytes 0..7 of V1 (X, Y)
movlps xmm0,qword ptr [V1]
// xmm1 high half <- bytes 4..11 of V2 (Y, Z)
movhps xmm1,qword ptr [V2 + 4]
// xmm1 low half  <- bytes 0..7 of V2 (X, Y)
movlps xmm1,qword ptr [V2]
// four lane-wise single-precision multiplies
mulps xmm0,xmm1
// write the (Y, Z) products to bytes 4..11 of the result
movhps qword ptr [Result + 4],xmm0
// write the (X, Y) products to bytes 0..7 of the result; Y is rewritten with the same value
movlps qword ptr [Result],xmm0
end;
var
  V, V1, V2: TVector3;
  I: Integer;
  // GetTickCount64 returns a QWord. Storing it in an Integer truncates the
  // value and makes the elapsed-time subtraction wrong (or overflow) once the
  // machine has been up for more than about 24.8 days.
  Tick: QWord;
begin
  { Benchmark driver: runs each multiply routine 100,000,000 times on the
    same two vectors, prints the elapsed wall-clock time in milliseconds,
    and then checks the last result. The checks run after the time has been
    printed in both cases so they are never part of the measurement. }
  V1.X := 1; V1.Y := 2; V1.Z := 3;
  V2.X := 4; V2.Y := 5; V2.Z := 6;

  // Compiler-generated Pascal version.
  Write('fpc: ');
  Tick := GetTickCount64;
  for I := 0 to 99999999 do
    V := Mult(V1, V2);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);

  // Hand-written assembly version.
  Write('Hand code: ');
  Tick := GetTickCount64;
  for I := 0 to 99999999 do
    V := Mult_SSE(V1, V2);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);
end.
Result:
- fpc (x86): ~203ms
- fpc (x64): ~500ms
- Hand code (x64): ~156ms
Notice that the x86 fpc result is quite close to the hand-coded SSE2 version of the Mult function, whereas the x64 fpc result is, well, roughly 2.5 times slower.

Edit: After looking at the asm output, I found that x86 and x64 generate the same assembly code. Furthermore, if I change TVector3 to a TVector4, then x86 and x64 run at the same speed. So I guess it has more to do with memory access performance than with the code itself.