Forum > FPC development

x64 32-bit floating point operations are slower than x86

(1/1)

kagamma:
So I decided to run a small benchmark on floating point operations and noticed that its performance is worse when compiling for the x86_64 target.

Specs:
OS: Windows 10
CPU: Intel Core i5-9400F
FPC version: 3.2.0-r45643

command line (x64): fpc -Twin64 -Px86_64 -CfSSE42 -CpCOREI -O4 -Sv test.pas
command line (x86): fpc -CfSSE42 -CpCOREI -O4 -Sv test.pas

Code:

{$mode objfpc}
{$asmmode intel}
{$assertions on}

{ Micro-benchmark: times a plain Pascal component-wise vector multiply
  against a hand-written SSE version and prints the elapsed milliseconds. }

uses
  SysUtils;

const
  { Number of calls timed for each implementation. }
  Iterations = 100000000;

type
  { Vector of three single-precision components. }
  TVector3 = record
    X, Y, Z: Single;
  end;

{ Returns the component-wise product of V1 and V2 (plain Pascal). }
function Mult(const V1, V2: TVector3): TVector3;
begin
  Result.X := V1.X * V2.X;
  Result.Y := V1.Y * V2.Y;
  Result.Z := V1.Z * V2.Z;
end;

{ Returns the component-wise product of V1 and V2 using packed SSE
  instructions. Each operand is loaded as two overlapping 8-byte halves so
  that only bytes 0..11 of the record are touched. }
function Mult_SSE(constref V1, V2: TVector3): TVector3; assembler; nostackframe;
asm
  { xmm0 := (V1.X, V1.Y, V1.Y, V1.Z): high half from offset 4, low half from offset 0 }
  movhps xmm0,qword ptr [V1 + 4]
  movlps xmm0,qword ptr [V1]
  { xmm1 := (V2.X, V2.Y, V2.Y, V2.Z), same layout }
  movhps xmm1,qword ptr [V2 + 4]
  movlps xmm1,qword ptr [V2]
  mulps  xmm0,xmm1
  { Store (Y, Z) at offset 4 and (X, Y) at offset 0; the Y lanes overlap
    and hold the same value. }
  movhps qword ptr [Result + 4],xmm0
  movlps qword ptr [Result],xmm0
end;

var
  V, V1, V2: TVector3;
  I: Integer;
  { GetTickCount64 returns a 64-bit tick count; storing it in a 32-bit
    Integer truncates it and corrupts the elapsed-time subtraction once the
    tick count no longer fits in 32 bits. }
  Tick: QWord;

begin
  V1.X := 1; V1.Y := 2; V1.Z := 3;
  V2.X := 4; V2.Y := 5; V2.Z := 6;

  Write('fpc: ');
  Tick := GetTickCount64;
  for I := 0 to Iterations - 1 do
    V := Mult(V1, V2);
  Writeln(GetTickCount64 - Tick, 'ms');
  { Verify after the time has been printed so the checks are not part of
    the measured interval (matches the second run below). }
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);

  Write('Hand code: ');
  Tick := GetTickCount64;
  for I := 0 to Iterations - 1 do
    V := Mult_SSE(V1, V2);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);
end.
Result:
- fpc (x86): ~203ms
- fpc (x64): ~500ms
- Hand code (x64): ~156ms

Notice that the x86 fpc result is quite close to the hand-coded SSE2 version of the Mult function, whereas the x64 fpc result is, well  :-[

Edit: After looking at the asm output, x86 and x64 generate the same assembly code. Furthermore, if I convert TVector3 to TVector4, then x86 and x64 have the same speed. So I guess it's more about memory access performance than the code itself.

mika:
1. for fpc version you use

--- Code: Pascal ---
function Mult(const V1, V2: TVector3): TVector3;

Declaring the parameters as constref instead will give better results:

--- Code: Pascal ---
function Mult(constref V1, V2: TVector3): TVector3;
2. The reason for the bad results of the 64-bit Pascal implementation is the way parameters are passed and the result is returned.

3. I did some testing myself
os : linux 64bit


{$mode objfpc}
{$asmmode intel}
{$assertions on}

{ Micro-benchmark: times four ways of computing a component-wise vector
  product (Pascal function, Pascal procedure, SSE function, SSE procedure)
  and prints the elapsed milliseconds for each. }

uses
  SysUtils;

const
  { Number of calls timed for each implementation. }
  Iterations = 100000000;

type
  { Vector of three single-precision components. }
  TVector3 = record
    X, Y, Z: Single;
  end;

{ Returns the component-wise product of V1 and V2 as a function result. }
function Mult(constref V1, V2: TVector3): TVector3;
begin
  Result.X := V1.X * V2.X;
  Result.Y := V1.Y * V2.Y;
  Result.Z := V1.Z * V2.Z;
end;

{ Writes the component-wise product of V1 and V2 into the caller's Result. }
procedure MultPr(constref V1, V2: TVector3; var  Result: TVector3);
begin
  Result.X := V1.X * V2.X;
  Result.Y := V1.Y * V2.Y;
  Result.Z := V1.Z * V2.Z;
end;

{ SSE version returning the product as a function result. The result is
  left in xmm0 (first 8 bytes) and xmm1 (remaining bytes) and never stored
  to memory. NOTE(review): this presumably relies on the 64-bit Linux
  calling convention the post was tested on; confirm before using it on
  another target. }
function Mult_SSE(constref V1, V2: TVector3): TVector3; assembler; nostackframe;
asm
  {$if sizeof(TVector3)=12}
  { 12-byte record: load exactly 8 + 4 bytes per operand. A 16-byte movdqu
    would read 4 bytes past the end of each record and feed that unrelated
    memory into the multiply. }
  movq   xmm0, [v1]
  movq   xmm2, [v2]
  mulps  xmm0, xmm2
  movss  xmm1, [v1+8]
  movss  xmm2, [v2+8]
  mulss  xmm1, xmm2
  {$else}
  { Other sizes (16 bytes in the post): one unaligned load per operand, then
    copy the product and shift its upper 8 bytes down into xmm1. }
  movdqu xmm0, [v1]
  movdqu xmm1, [v2]
  mulps  xmm0 , xmm1
  movdqu xmm1, xmm0
  psrldq xmm1,8
  {$endif}
end;

{ SSE version writing the product through the var parameter. The access
  widths are selected at compile time from SizeOf(TVector3) so that no byte
  outside the record is read or written. }
procedure Mult_SSEPr(constref V1, V2: TVector3; var Result: TVector3); assembler; nostackframe;
asm
  {$if sizeof(TVector3)=12}
  { First two components as a packed pair. }
  movq xmm0, [v1]
  movq xmm1, [v2]
  mulps  xmm0 , xmm1
  movq [Result],xmm0
  { Third component as a scalar. }
  movss xmm0, [v1+8]
  movss xmm1, [v2+8]
  mulss  xmm0 , xmm1
  movss [Result+8],xmm0
  {$else}
  {$if sizeof(TVector3)=16}
  movdqu xmm0, [v1]
  movdqu xmm1, [v2]
  mulps  xmm0 , xmm1
  movdqu [Result],xmm0
  {$else}
      {$fatal  sizeof(TVector3) has to be 12 or 16 bytes }
  {$endif}
  {$endif}
end;

var
  V, V1, V2: TVector3;
  I: Integer;
  { GetTickCount64 returns a 64-bit tick count; storing it in a 32-bit
    Integer truncates it and corrupts the elapsed-time subtraction once the
    tick count no longer fits in 32 bits. }
  Tick: QWord;

begin
  writeln('Size of TVector3 ', sizeof(TVector3),' bytes');
  V1.X := 1; V1.Y := 2; V1.Z := 3;
  V2.X := 4; V2.Y := 5; V2.Z := 6;

  Write('fpc function: ');
  Tick := GetTickCount64;
  for I := 0 to Iterations - 1 do
    V := Mult(V1, V2);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);

  write ('fpc procedure: ');
  Tick := GetTickCount64;
  for I := 0 to Iterations - 1 do
    MultPr(V1, V2, V);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);

  Write('Hand code function: ');
  Tick := GetTickCount64;
  for I := 0 to Iterations - 1 do
    V := Mult_SSE(V1, V2);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);

  Write('Hand code procedure: ');
  Tick := GetTickCount64;
  for I := 0 to Iterations - 1 do
    Mult_SSEPr(V1, V2, v);
  Writeln(GetTickCount64 - Tick, 'ms');
  assert(V.X = 1*4);
  assert(V.Y = 2*5);
  assert(V.Z = 3*6);
end.
output:
Size of TVector3 12 bytes
fpc function: 817ms
fpc procedure: 128ms
Hand code function: 185ms
Hand code procedure: 139ms

and for TVector3 of 4 Single
 
type
  { Variant of the benchmark record with a fourth Single field (A) added
    after X, Y and Z. }
  TVector3 = record
    X: Single;
    Y: Single;
    Z: Single;
    A: Single;
  end;
output:
Size of TVector3 16 bytes
fpc function: 1164ms
fpc procedure: 161ms
Hand code function: 945ms
Hand code procedure: 116ms

In my tests the function versions had far worse results when TVector3 had 4 singles instead of 3.

Navigation

[0] Message Index

Go to full version