{
  testing SIMD intrinsic emulation
  conclusion: not worth using direct SIMD instructions, as only about a 25% improvement is achieved
}
{$mode delphi}
program testSIMD;
uses
SysUtils;
type
// {$ALIGN 16}
vf32 = array of single; // dynamic array of singles (unused below; kept for reference)
T4s = array[0..4 - 1] of single; // 4 x 32-bit = 128 bits = one XMM register
T8s = array[0..8 - 1] of single; // 8 singles x 4 bytes = 32 bytes = 256 bits
T16s = array[0..16 - 1] of single; // 16 x 32-bit = 512 bits (two YMM registers)
T4d = array [0..4 - 1] of double; // 4 x 64-bit = 256 bits = one YMM register
// 4-component single-precision vector with a SIMD-backed '+' operator
TVector = record
x, y, z, w: single;
public
class operator +(const v1: TVector; const v2: TVector): TVector; inline;
end;
{$ASMMODE intel}
{ Add two 4-single vectors with one SSE packed add.
  Fix: MOVUPS instead of MOVAPS — const T4s parameters are passed by
  reference and FPC only guarantees 4-byte alignment for arrays of single,
  so the aligned MOVAPS form could raise #GP on a misaligned operand
  (_add8 below already uses the unaligned form).
  With nostackframe, [a]/[b]/[result] resolve to the platform ABI's
  parameter/result-pointer registers. }
function _add4(const a: T4s; const b: T4s): T4s; assembler; nostackframe;
asm
  MOVUPS  XMM0, [a]         // xmm0 = a[0..3]
  MOVUPS  XMM2, [b]         // xmm2 = b[0..3]
  ADDPS   XMM0, XMM2        // xmm0 += b (4 packed singles)
  MOVUPS  [result], XMM0    // result[0..3] = xmm0
end;
{ Add two 8-single vectors using two 128-bit SSE packed adds.
  MOVUPS (unaligned) is used because const T8s parameters are passed by
  reference and FPC only guarantees 4-byte alignment for arrays of single.
  NOTE(review): with nostackframe, [a]/[b]/[result] resolve to the ABI's
  parameter/result registers — confirm for the target platform. }
function _add8(const a: T8s; const b: T8s): T8s; assembler; nostackframe;
asm
MOVUPS XMM0, [a] // result[0..3] = a[0..3] + b[0..3]
MOVUPS XMM2, [b]
ADDPS XMM0, XMM2
MOVUPS [result],XMM0
MOVUPS XMM0, [a+16] // result[4..7] = a[4..7] + b[4..7] (second 16-byte half)
MOVUPS XMM2, [b+16]
ADDPS XMM0, XMM2
MOVUPS [result+16],XMM0
end;
{ Element-wise sum of two T4s vectors — plain Pascal reference version. }
function add4(const a: T4s; const b: T4s): T4s; inline;
var
  k: integer;
begin
  for k := low(Result) to high(Result) do
    Result[k] := a[k] + b[k];
end;
{ Element-wise sum of two T8s vectors — plain Pascal reference version. }
function add8(const a: T8s; const b: T8s): T8s; inline;
var
  k: integer;
begin
  for k := low(Result) to high(Result) do
    Result[k] := a[k] + b[k];
end;
{ Packed-single addition of two TVector records (x,y,z,w) with one SSE add.
  Fix: load v2 through MOVUPS into a register first — the legacy-SSE
  ADDPS with a MEMORY operand requires 16-byte alignment, and TVector
  (four singles) is only guaranteed 4-byte alignment, so the original
  `ADDPS XMM0, [v2]` could raise #GP on a misaligned operand. }
class operator TVector.+(const v1: TVector; const v2: TVector): TVector;
assembler; nostackframe;
asm
  MOVUPS  XMM0, [v1]        // xmm0 = v1.(x,y,z,w)
  MOVUPS  XMM1, [v2]        // unaligned-safe load of v2
  ADDPS   XMM0, XMM1        // packed single add, register form (no alignment req.)
  MOVUPS  [result], XMM0    // store sum into result record
end;
const
n = 200000000;
{ Benchmark TVector addition three ways: the SIMD class-operator call,
  plain FPC per-field code, and asm embedded directly in the loop.
  Fix: the embedded asm loaded b with a memory-operand ADDPS, which in
  legacy SSE encoding requires 16-byte alignment; local records are only
  4-byte aligned, so b is now loaded via MOVUPS first. }
procedure testVector; // ~2.5x performance when embedding asm code
var
  a, b, c: TVector;
  i: integer;
  t0: int64;
begin
  a.x := 1; a.y := 1; a.z := 1; a.w := 1;
  b.x := 2; b.y := 2; b.z := 2; b.w := 2;
  writeln('test vector');
  t0 := getTickCount64;
  for i := 0 to n do
    c := a + b;               // SIMD class operator (includes call overhead)
  writeln('lap call func w/simd:', getTickCount64 - t0);
  t0 := getTickCount64;
  for i := 0 to n do
  begin
    c.x := a.x + b.x;
    c.y := a.y + b.y;
    c.z := a.z + b.z;
    c.w := a.w + b.w;
  end;
  writeln('lap fpc native:', getTickCount64 - t0);
  t0 := getTickCount64;
  for i := 0 to n do
    asm
      MOVUPS XMM0, [a]        // xmm0 = a
      MOVUPS XMM1, [b]        // unaligned-safe load (see header note)
      ADDPS XMM0, XMM1        // register form: no 16-byte alignment required
      MOVUPS [c], XMM0        // c = a + b
    end;
  writeln('lap embedded simd:', getTickCount64 - t0);
end;
{ Benchmark adding two 4-single arrays: asm SIMD function call vs. inline
  FPC loop vs. inlined Pascal helper. Prints timings and the final result.
  Fix: a terminating writeln is added after the result dump so the next
  test's header starts on its own line (consistent with test8s/test16s). }
procedure test4s;
var
  a, b, c: T4s;
  s: single;
  i, t0: int64;
  ix0, ix1: integer;
begin
  a[0] := 1; a[1] := 2; a[2] := 3; a[3] := 4;
  b[0] := 5; b[1] := 6; b[2] := 7; b[3] := 8;
  writeln('test4s');
  t0 := getTickCount64;
  for i := 0 to n do
    c := _add4(a, b);         // SIMD asm function (per-call overhead)
  writeln('lap simd:', getTickCount64 - t0);
  t0 := getTickCount64;
  for ix0 := 0 to n do
    for ix1 := 0 to 3 do
      c[ix1] := a[ix1] + b[ix1];
  writeln('lap fpc inline:', getTickCount64 - t0);
  t0 := getTickCount64;
  for ix0 := 0 to n do
    c := add4(a, b);          // inlined Pascal helper
  writeln('lap fpc func:', getTickCount64 - t0);
  for s in c do Write(format('%f ', [s]));
  writeln;                    // terminate the result line (consistency fix)
end;
{ Benchmark adding two 8-single arrays: asm function call, embedded
  128-bit SSE asm, embedded 256-bit AVX asm, and plain FPC code.
  Fixes:
  - MOVUPS replaces MOVAPS in the embedded SSE block: stack locals of
    single are only guaranteed 4-byte alignment, so the aligned form
    could raise #GP.
  - VZEROUPPER is issued after the AVX loop to avoid AVX->SSE transition
    penalties in the surrounding non-AVX code.
  - unused locals ix0, ix1 removed. }
procedure test8s;
var
  a, b, c: T8s;
  s: single;
  i, j, t0: int64;
begin
  for i := 0 to 7 do
  begin
    a[i] := i;
    b[i] := i;
  end;
  writeln('test8s');
  // simd asm func -> slow due to call
  t0 := getTickCount64;
  for i := 0 to n do c := _add8(a, b);
  writeln('lap simd asm func:', getTickCount64 - t0);
  fillchar(c, sizeof(c), 0);
  // simd w/asm embedded
  t0 := getTickCount64;
  for i := 0 to n do
  begin
    asm // inline add 8s using 128 bit xmm regs (unaligned loads, see header)
      MOVUPS XMM0, [a]
      MOVUPS XMM1, [b]
      ADDPS XMM0, XMM1
      MOVUPS [c],XMM0
      MOVUPS XMM0, [a+4*4]    // second half: offset is 4 singles x 4 bytes
      MOVUPS XMM1, [b+4*4]
      ADDPS XMM0, XMM1
      MOVUPS [c+4*4],XMM0
    end;
  end;
  writeln('lap code inserted asm simd using X 128 regs:', getTickCount64 - t0);
  for s in c do Write(format('%f ', [s]));
  writeln;
  // using VMOVDQU YMM0,[a] // 256 bit reg = 8 x 32-bit singles
  fillchar(c, sizeof(c), 0);
  t0 := getTickCount64;
  for i := 0 to n do
  begin
    asm // inline add 8s using 256 bit 'Y' regs
      VMOVDQU YMM0,[a]
      VMOVDQU YMM1,[b]
      VADDPS YMM0, YMM0, YMM1
      VMOVDQU [c],YMM0
    end;
  end;
  asm
    VZEROUPPER                // clear upper YMM state before SSE/FPC code
  end;
  writeln('lap code inserted asm simd using Y 256 bit regs:', getTickCount64 - t0);
  for s in c do Write(format('%f ', [s]));
  writeln;
  t0 := getTickCount64;
  for i := 0 to n do for j := 0 to 7 do c[j] := a[j] + b[j];
  writeln('lap fpc native code :', getTickCount64 - t0);
end;
{ Benchmark adding two 16-single arrays: two 256-bit AVX adds per pass
  vs. a plain FPC loop; prints the timing ratio.
  Bug fix: the scalar reference loop previously covered only elements
  0..7 — half the array — so its timing (and the printed ratio) was
  unfairly low; it now processes all 16 elements like the SIMD path.
  Also: VZEROUPPER issued after the AVX loop to avoid AVX->SSE
  transition penalties. }
procedure test16s;
var
  a16, b16, c16: T16s;        // 16 x 4 bytes = 64 bytes = 512 bits
  s: single;
  i, t0, ts, tn: int64;
  ix0, ix1: integer;
begin
  writeln('test 16 single array packed');
  for i := 0 to pred(16) do
  begin
    a16[i] := i;
    b16[i] := i;
    c16[i] := 0;
  end;
  t0 := getTickCount64;
  for i := 0 to n do
  begin
    asm // inline add 16s: two 8-single halves with 256-bit YMM regs
      VMOVDQU YMM0,[a16]
      VMOVDQU YMM1,[b16]
      VADDPS YMM0, YMM0, YMM1
      VMOVDQU [c16],YMM0
      VMOVDQU YMM0,[a16+8*4]  // second half: offset 8 singles x 4 bytes
      VMOVDQU YMM1,[b16+8*4]
      VADDPS YMM0, YMM0, YMM1
      VMOVDQU [c16+8*4],YMM0
    end;
  end;
  asm
    VZEROUPPER                // clear upper YMM state before SSE/FPC code
  end;
  ts := getTickCount64 - t0;
  writeln('lap T16s, code inserted asm simd using two Y 256 bit regs:', ts);
  for s in c16 do Write(format('%f ', [s]));
  writeln;
  t0 := getTickCount64;
  for ix0 := 0 to n do
    for ix1 := 0 to pred(16) do  // all 16 elements (was 0..7: bug)
      c16[ix1] := a16[ix1] + b16[ix1];
  tn := getTickCount64 - t0;
  writeln('lap fpc sum expression using T16s:', tn, ' ratio:', 1.0 * tn / ts);
end;
{ Benchmark: add two 4-double arrays (4 x 64 = 256 bits, one YMM register)
  with inline AVX asm vs. a plain FPC loop; verifies the SIMD result.
  NOTE(review): despite the name, this routine handles 4 doubles (T4d).
  NOTE(review): the asm block clobbers R15 — a callee-saved register —
  without preserving it; FPC may keep live data there. Confirm, or
  save/restore R15 around the block.
  NOTE(review): no VZEROUPPER after the AVX code; mixing with subsequent
  SSE code may incur AVX->SSE transition penalties. }
procedure test8d;
var
a, b, c: T4d;
pa, pb, pc: ^T4d;
d: double;
i, j: integer;
t0: int64;
begin
writeln('test 4 x double');
for i := 0 to 4 - 1 do
begin
a[i] := i;
b[i] := i;
c[i] := 0;
end;
pa := @a;
pb := @b;
pc := @c;
// add 2 arrays of 4 doubles each
t0 := gettickcount64;
for i := 0 to n do
begin
asm
MOV R15,[pa] // r15 = pa (pointer to a); WARNING: R15 is callee-saved
VMOVUPD YMM0,[R15] // ymm0 = a[0..3]
MOV R15,[pb]
VADDPD YMM0, YMM0, [R15] // ymm0 += b[0..3]
MOV R15,[pc]
VMOVUPD [R15],YMM0 // c[0..3] = ymm0
end;
end;
writeln('lap inline asm simd:', gettickcount64 - t0);
for i := 0 to 4 - 1 do assert(c[i] = a[i] + b[i]); // verify SIMD result
//Write(format('%f ', [pc[i]]));
//writeln;
t0 := gettickcount64;
for i := 0 to n do
for j := 0 to high(a) do
c[j] := a[j] + b[j];
writeln('lap fpc native expression:', gettickcount64 - t0);
for d in c do Write(format('%f ', [d]));
writeln;
end;
{ Benchmark: element-wise add of two n-element double vectors, AVX inline
  asm (4 doubles per 256-bit YMM op) vs. a plain FPC expression; both
  results are verified with asserts.
  NOTE(review): r13..r15 are loaded in one asm block and assumed to keep
  their values across every iteration of the surrounding Pascal for-loop;
  the compiler gives no such guarantee, and all three are callee-saved
  registers clobbered without being preserved — fragile, verify on the
  target compiler/ABI before reuse.
  NOTE(review): the asm loop processes n div 4 blocks, so it assumes
  n mod 4 = 0 (true for n = 200000000); otherwise a scalar tail loop
  would be needed.
  NOTE(review): no VZEROUPPER after the AVX loop (transition penalties). }
procedure testAddLargeVect; { simd inline is 25% faster than fpc native expression }
var
a, b, c: array of double;
i, j: integer;
t0: int64;
begin
Write('test 4 x double large vector add, filling data...');
{ create vectors }
a := nil;
b := nil;
c := nil;
setLength(a, n);
setLength(b, n);
setLength(c, n);
{ fill }
for i := 0 to pred(n) do
begin
a[i] := i;
b[i] := i;
c[i] := 0;
end;
writeln('done');
{ simd inline asm code }
t0 := gettickcount64;
asm // preload regs, r13..r15 = pa..pc — see NOTE(review) in header
MOV R13,[a[0]] // r13=@a[0], r14=@b[0], r15=@c[0] (dyn arrays hold data pointers)
MOV R14,[b[0]]
MOV R15,[c[0]]
end;
for i := 0 to pred(n div 4) do // blocks of (256 / 64) 4 doubles
begin
asm // use 256 bit ymm regs.
VMOVUPD YMM0,[R13] // ymm0 = a-block
VADDPD YMM0, YMM0, [R14] // ymm0 += b-block
VMOVUPD [R15],YMM0 // c-block = ymm0
ADD R13, 4 * sizeof(double) // advance to next 4 x double block
ADD R14, 4 * sizeof(double)
ADD R15, 4 * sizeof(double)
end;
end;
writeln('lap inline asm simd:', gettickcount64 - t0);
for j := 0 to high(c) do assert(c[j] = a[j] + b[j]); // verify SIMD result
//writeln(format('%.0f %.0f %.0f', [a[j], b[j], c[j]]));
{ fpc native expression }
fillChar(c[0], length(c) * sizeof(c[0]), 0);
t0 := gettickcount64;
for i := 0 to pred(n) do
c[i] := a[i] + b[i];
writeln('lap fpc native expression:', gettickcount64 - t0);
for j := 0 to high(c) do assert(c[j] = a[j] + b[j]); // verify native result
//for j := high(a) to high(a) do writeln(format('%.0f %.0f %.0f', [a[j], b[j], c[j]]));
end;
{ Program entry point: run the active benchmark; the other tests are left
  disabled. Fix: removed the unused module-level variable i. }
begin
  testVector;
  //test16s;
  //test4s;
  //test8s;
  //test8d;
  //testAddLargeVect;
  writeln(#10: 3, 'end.');
  readln;                     // keep the console window open
end.