x64 32bit floating point operations is slower than x86

1. In the FPC version you use:

function Mult(const V1, V2: TVector3): TVector3;

Passing the parameters as constref instead of const will give better results:

Code: Pascal [Select][+]

function Mult(constref V1, V2: TVector3): TVector3;

2. The reason for the bad results of the 64-bit Pascal implementation is the way the parameters are passed and the result is returned.

3. I did some testing myself.
OS: Linux 64-bit

Code: Pascal [Select][+]

    {$mode objfpc}
    {$asmmode intel}
    {$assertions on}
 
    uses
      SysUtils;
 
    type
      TVector3 = record
        X, Y, Z: Single;
      end;
 
    function Mult(constref V1, V2: TVector3): TVector3;
    begin
 
      Result.X := V1.X * V2.X;
      Result.Y := V1.Y * V2.Y;
      Result.Z := V1.Z * V2.Z;
    end;
 
    procedure MultPr(constref V1, V2: TVector3; var  Result: TVector3);
    begin
      Result.X := V1.X * V2.X;
      Result.Y := V1.Y * V2.Y;
      Result.Z := V1.Z * V2.Z;
    end;
 
    function Mult_SSE(constref V1, V2: TVector3): TVector3; assembler; nostackframe;
    asm
      movdqu xmm0, [v1]
      movdqu xmm1, [v2]
      mulps  xmm0 , xmm1
      movdqu xmm1, xmm0
      psrldq xmm1,8
    end;
 
    // Hand-coded SSE component-wise product, written into a caller-supplied
    // record. The instruction sequence is selected at compile time from
    // sizeof(TVector3); any size other than 12 or 16 bytes aborts compilation.
    procedure Mult_SSEPr(constref V1, V2: TVector3; var Result: TVector3); assembler; nostackframe;
    asm
     {$if sizeof(TVector3)=12}
 
      // First 8 bytes (X, Y): load, packed multiply, store 8 bytes.
      movq xmm0, [v1]
      movq xmm1, [v2]
      mulps  xmm0 , xmm1
      movq [Result],xmm0
 
      // Remaining 4 bytes at offset 8 (Z): scalar load, multiply, store.
      movss xmm0, [v1+8]
      movss xmm1, [v2+8]
      mulss  xmm0 , xmm1
      movss [Result+8],xmm0
 
      {$else}
      {$if sizeof(TVector3)=16}
      // Whole record in one 16-byte unaligned load / multiply / store.
      movdqu xmm0, [v1]
      movdqu xmm1, [v2]
      mulps  xmm0 , xmm1
      movdqu [Result],xmm0
      {$else}
          {$fatal  sizeof(TVector3) has to be 12 or 16 bytes }
      {$endif}
      {$endif}
    end;
 
 
    var
      V, V1, V2: TVector3;
      I, Tick: Integer;
 
    begin
      writeln('Size of TVector3 ', sizeof(TVector3),' bytes');
      V1.X := 1; V1.Y := 2; V1.Z := 3;
      V2.X := 4; V2.Y := 5; V2.Z := 6;
      Write('fpc function: ');
      Tick := GetTickCount64;
      for I := 0 to 99999999 do
        V := Mult(V1, V2);
 
      Writeln(GetTickCount64 - Tick, 'ms');
 
      assert(V.X = 1*4);
      assert(V.Y = 2*5);
      assert(V.Z = 3*6);
 
      write ('fpc procedure: ');
      Tick := GetTickCount64;
      for I := 0 to 99999999 do
        MultPr(V1, V2, V);
 
      Writeln(GetTickCount64 - Tick, 'ms');
 
      assert(V.X = 1*4);
      assert(V.Y = 2*5);
      assert(V.Z = 3*6);
 
      Write('Hand code function: ');
      Tick := GetTickCount64;
      for I := 0 to 99999999 do
        V := Mult_SSE(V1, V2);
      Writeln(GetTickCount64 - Tick, 'ms');
 
      assert(V.X = 1*4);
      assert(V.Y = 2*5);
      assert(V.Z = 3*6);
 
      Write('Hand code procedure: ');
      Tick := GetTickCount64;
      for I := 0 to 99999999 do
        Mult_SSEPr(V1, V2, v);
      Writeln(GetTickCount64 - Tick, 'ms');
 
      assert(V.X = 1*4);
      assert(V.Y = 2*5);
      assert(V.Z = 3*6);
    end.

output:
Size of TVector3 12 bytes
fpc function: 817ms
fpc procedure: 128ms
Hand code function: 185ms
Hand code procedure: 139ms

And for a TVector3 of 4 Singles:

Code: Pascal [Select][+]

    type
      TVector3 = record
        X, Y, Z, A: Single;
      end;

output:
Size of TVector3 16 bytes
fpc function: 1164ms
fpc procedure: 161ms
Hand code function: 945ms
Hand code procedure: 116ms

In my tests the function versions had far worse results when TVector3 had 4 Singles instead of 3.

Lazarus

Bookstore

Search

Recent

Author Topic: x64 32bit floating point operations is slower than x86 (Read 2745 times)

kagamma

x64 32bit floating point operations is slower than x86

mika

Re: x64 32bit floating point operations is slower than x86

	Computer Math and Games in Pascal (preview)
	Lazarus Handbook