Forum > Third party

Collection of operator overloading for vector operations with SIMD

(1/2) > >>

zamronypj:
A collection of operator overloads for vector operations using Intel SSE SIMD instructions, written in Free Pascal.

https://github.com/zamronypj/oprsimd

Thaddy:
What I am missing is a pure pascal version and the ability to use the compiler support for vector operations.
In the long run that might be a better option. (Not now, but in the future)

Good job, nevertheless!

P.S.: I am referring to the available -Sv option in combination with -CfXXX. That combination is processor-independent.

BeanzMaster:
Hi, if you are interested in a vector library using SIMD, you can also take a look at my library here: https://github.com/jdelauney/SIMD-VectorMath-UnitTest

Best regards

Thaddy:
Still, both of you need to test against the compiler .... I gave a hint on how to use them...

rforcen:
I've done some performance testing, and only if you embed the asm SIMD code directly (without calling functions or operators) can you achieve a 2.5x performance improvement using SIMD vs. native code, so this is a feasible option only in very specific situations.

When using function calls or operators, there is a 20-35% performance gain.

attached code is a bit messy just for testing purposes.


// --- Code: Pascal ---
{ testing SIMD intrinsic emulation
  conclusion: not worth using direct SIMD instructions as only a 25% of
  improvement is achieved }

{$mode delphi}
// The Assert result checks in test8d / testAddLargeVect are compiled out
// unless assertions are switched on.
{$ASSERTIONS ON}

program testSIMD;

uses
  SysUtils;

type
  //  {$ALIGN 16}
  vf32 = array of single;
  T4s = array[0..4 - 1] of single;
  T8s = array[0..8 - 1] of single;   // 8 x 4 = 32 bytes x 8 = 256 bits
  T16s = array[0..16 - 1] of single;
  T4d = array [0..4 - 1] of double;  // 4*64 = 256 bits = Y reg

  { Four packed singles with a SIMD-backed '+' operator. }
  TVector = record
    x, y, z, w: single;
  public
    class operator +(const v1: TVector; const v2: TVector): TVector; inline;
  end;

{$ASMMODE intel}

{ Adds two 4 x single arrays with one ADDPS.
  Uses unaligned moves: with ALIGN 16 commented out above nothing guarantees
  16-byte alignment of the operands, and MOVAPS faults on unaligned addresses. }
function _add4(const a: T4s; const b: T4s): T4s; assembler; nostackframe;
asm
         MOVUPS  XMM0, [a]
         MOVUPS  XMM2, [b]
         ADDPS   XMM0, XMM2
         MOVUPS  [result],XMM0
end;

{ Adds two 8 x single arrays as two 128-bit halves. }
function _add8(const a: T8s; const b: T8s): T8s; assembler; nostackframe;
asm
         MOVUPS  XMM0, [a]
         MOVUPS  XMM2, [b]
         ADDPS   XMM0, XMM2
         MOVUPS  [result],XMM0

         MOVUPS  XMM0, [a+16]
         MOVUPS  XMM2, [b+16]
         ADDPS   XMM0, XMM2
         MOVUPS  [result+16],XMM0
end;

{ Pure Pascal reference: element-wise sum of two 4 x single arrays. }
function add4(const a: T4s; const b: T4s): T4s; inline;
var
  i: integer;
begin
  for i := 0 to 3 do Result[i] := a[i] + b[i];
end;

{ Pure Pascal reference: element-wise sum of two 8 x single arrays. }
function add8(const a: T8s; const b: T8s): T8s; inline;
var
  i: integer;
begin
  for i := 0 to 7 do Result[i] := a[i] + b[i];
end;

{ SIMD implementation of TVector '+': one packed add over x, y, z, w. }
class operator TVector.+(const v1: TVector; const v2: TVector): TVector;
  assembler; nostackframe;
asm
         MOVUPS  XMM0, [v1]
         ADDPS   XMM0, [v2]
         MOVUPS  [result], XMM0
end;

const
  n = 200000000; // iterations per timed lap (and element count in testAddLargeVect)

{ Times TVector addition three ways: SIMD operator call, plain field-wise
  Pascal, and SIMD asm embedded directly in the loop. }
procedure testVector;  // 2.5 performance when embedding asm code
var
  a, b, c: TVector;
  i: integer;
  t0: int64;
begin
  a.x := 1;
  a.y := 1;
  a.z := 1;
  a.w := 1;
  b.x := 2;
  b.y := 2;
  b.z := 2;
  b.w := 2;

  writeln('test vector');
  t0 := getTickCount64;
  for i := 0 to pred(n) do
    c := a + b;
  writeln('lap call func w/simd:', getTickCount64 - t0);

  t0 := getTickCount64;
  for i := 0 to pred(n) do
  begin
    c.x := a.x + b.x;
    c.y := a.y + b.y;
    c.z := a.z + b.z;
    c.w := a.w + b.w;
  end;
  writeln('lap fpc native:', getTickCount64 - t0);

  t0 := getTickCount64;
  for i := 0 to pred(n) do
    asm
             MOVUPS  XMM0, [a]
             ADDPS   XMM0, [b]
             MOVUPS  [c], XMM0
    end;
  writeln('lap embedded simd:', getTickCount64 - t0);
end;

{ Times 4 x single addition: SIMD asm function, inline Pascal loop, and
  the Pascal helper add4; then prints the last result. }
procedure test4s;
var
  a, b, c: T4s;
  s: single;
  i, t0: int64;
  ix0, ix1: integer;
begin
  a[0] := 1;
  a[1] := 2;
  a[2] := 3;
  a[3] := 4;

  b[0] := 5;
  b[1] := 6;
  b[2] := 7;
  b[3] := 8;

  writeln('test4s');

  t0 := getTickCount64;
  for i := 0 to pred(n) do
    c := _add4(a, b);
  writeln('lap simd:', getTickCount64 - t0);

  t0 := getTickCount64;
  for ix0 := 0 to pred(n) do
    for ix1 := 0 to 3 do
      c[ix1] := a[ix1] + b[ix1];
  writeln('lap fpc inline:', getTickCount64 - t0);

  t0 := getTickCount64;
  for ix0 := 0 to pred(n) do
    c := add4(a, b);
  writeln('lap fpc func:', getTickCount64 - t0);

  for s in c do Write(format('%f ', [s]));
  writeln; // terminate the value dump, as the other tests do
end;

{ Times 8 x single addition: SIMD asm function, embedded 128-bit asm,
  embedded 256-bit asm, and a plain Pascal loop. }
procedure test8s;
var
  a, b, c: T8s;
  s: single;
  i, j, t0: int64;
begin
  for i := 0 to 7 do
  begin
    a[i] := i;
    b[i] := i;
  end;
  writeln('test8s');

  // simd asm func   -> slow due to call
  t0 := getTickCount64;
  for i := 0 to pred(n) do c := _add8(a, b);
  writeln('lap simd asm func:', getTickCount64 - t0);

  fillchar(c, sizeof(c), 0);

  // simd w/asm embedded
  t0 := getTickCount64;
  for i := 0 to pred(n) do
  begin
    //c := _add8(a, b);
    asm // inline add 8s using 128 bit xmm regs.
             // Unaligned moves: the locals a, b, c carry no 16-byte
             // alignment guarantee, which MOVAPS would require.
             MOVUPS  XMM0, [a]
             MOVUPS  XMM1, [b]
             ADDPS   XMM0, XMM1
             MOVUPS  [c],XMM0

             MOVUPS  XMM0, [a+4*4] // offset is 4 single x 4 bytes
             MOVUPS  XMM1, [b+4*4]
             ADDPS   XMM0, XMM1
             MOVUPS  [c+4*4],XMM0
    end;
  end;
  writeln('lap code inserted asm simd using X 128 regs:', getTickCount64 - t0);
  for s in c do Write(format('%f ', [s]));
  writeln;

  // using  VMOVDQA YMM0,[a] // 256 bit reg = 8 x 32fp
  fillchar(c, sizeof(c), 0);

  t0 := getTickCount64;
  for i := 0 to pred(n) do
  begin
    asm // inline add 8s using 256 bit 'Y' regs
             VMOVDQU  YMM0,[a]
             VMOVDQU  YMM1,[b]
             VADDPS  YMM0, YMM0, YMM1
             VMOVDQU   [c],YMM0
    end;
  end;
  writeln('lap code inserted asm simd using Y 256 bit regs:', getTickCount64 - t0);
  for s in c do Write(format('%f ', [s]));
  writeln;

  t0 := getTickCount64;
  for i := 0 to pred(n) do
    for j := 0 to 7 do
      c[j] := a[j] + b[j];
  writeln('lap fpc native code :', getTickCount64 - t0);
end;

{ Times 16 x single addition: two embedded 256-bit adds vs. a plain Pascal
  loop over the same 16 elements, and prints the native/simd time ratio. }
procedure test16s;
var
  a16, b16, c16: T16s; // 16 x 32 = 512 bits
  s: single;
  i, t0, ts, tn: int64;
  ix0, ix1: integer;
begin
  writeln('test 16 single array packed');

  for i := 0 to pred(16) do
  begin
    a16[i] := i;
    b16[i] := i;
    c16[i] := 0;
  end;

  t0 := getTickCount64;
  for i := 0 to pred(n) do
  begin
    asm // inline add 16s
             VMOVDQU  YMM0,[a16]
             VMOVDQU  YMM1,[b16]
             VADDPS  YMM0, YMM0, YMM1
             VMOVDQU   [c16],YMM0

             VMOVDQU  YMM0,[a16+8*4]
             VMOVDQU  YMM1,[b16+8*4]
             VADDPS  YMM0, YMM0, YMM1
             VMOVDQU   [c16+8*4],YMM0
    end;
  end;
  ts := getTickCount64 - t0;
  writeln('lap T16s, code inserted asm simd using two Y 256 bit regs:', ts);
  for s in c16 do Write(format('%f ', [s]));
  writeln;

  t0 := getTickCount64;
  // Cover all 16 elements so both laps do the same amount of work
  // (the simd lap above adds the full array).
  for ix0 := 0 to pred(n) do
    for ix1 := 0 to high(c16) do
      c16[ix1] := a16[ix1] + b16[ix1];
  tn := getTickCount64 - t0;
  Write('lap fpc sum expression using T16s:', tn, ' ratio:');
  if ts > 0 then
    writeln(1.0 * tn / ts)
  else
    writeln('n/a'); // simd lap measured as 0 ticks; avoid dividing by zero
end;

{ Times 4 x double addition (one 256-bit register) through pointers in
  embedded asm vs. a plain Pascal loop, asserting the simd result. }
procedure test8d;
var
  a, b, c: T4d;
  pa, pb, pc: ^T4d;
  d: double;
  i, j: integer;
  t0: int64;
begin
  writeln('test 4 x double');

  for i := 0 to 4 - 1 do
  begin
    a[i] := i;
    b[i] := i;
    c[i] := 0;
  end;

  pa := @a;
  pb := @b;
  pc := @c;

  // add 2 arrays of 4 doubles each
  t0 := gettickcount64;
  for i := 0 to pred(n) do
  begin
    asm
             MOV     R15,[pa]
             VMOVUPD  YMM0,[R15]   // ymm0=[pa]
             MOV     R15,[pb]
             VADDPD  YMM0, YMM0, [R15] // ymm0+=[pb]
             MOV     R15,[pc]
             VMOVUPD  [R15],YMM0 // [pc]=ymm0
    end;
  end;
  writeln('lap inline asm simd:', gettickcount64 - t0);
  for i := 0 to 4 - 1 do assert(c[i] = a[i] + b[i]);
  //Write(format('%f ', [pc[i]]));
  //writeln;

  t0 := gettickcount64;

  for i := 0 to pred(n) do
    for j := 0 to high(a) do
      c[j] := a[j] + b[j];

  writeln('lap fpc native expression:', gettickcount64 - t0);
  for d in c do Write(format('%f ', [d]));
  writeln;
end;

{ Adds two n-element double vectors, once with embedded 256-bit asm in
  blocks of 4 doubles and once with a plain Pascal loop, asserting both. }
procedure testAddLargeVect;
{ simd inline is 25%  faster than fpc native expression }
var
  a, b, c: array of double;
  i, j: integer;
  t0: int64;
begin
  Write('test 4 x double large vector add, filling data...');

  { create vectors }
  a := nil;
  b := nil;
  c := nil;

  setLength(a, n);
  setLength(b, n);
  setLength(c, n);

  { fill }
  for i := 0 to pred(n) do
  begin
    a[i] := i;
    b[i] := i;
    c[i] := 0;
  end;
  writeln('done');

  { simd inline asm code }
  t0 := gettickcount64;

  // NOTE(review): R13..R15 are expected to keep their values across the
  // Pascal 'for' statement between the two asm blocks; this holds only if
  // the compiler does not use those registers for the loop - confirm for
  // the optimisation level in use.
  asm // preload regs, r13..r15 = pa..pc
           MOV     R13,[a[0]]   // r13=@a[0], r14=@b[0], r15=@c[0]
           MOV     R14,[b[0]]
           MOV     R15,[c[0]]
  end;

  for i := 0 to pred(n div 4) do // blocks of (256 / 64) 4 doubles
  begin
    asm // use 256 bit ymm regs.
             VMOVUPD  YMM0,[R13]   // ymm0=[pa]
             VADDPD  YMM0, YMM0, [R14] // ymm0+=[pb]
             VMOVUPD  [R15],YMM0 // [pc]=ymm0

             ADD     R13, 4 * sizeof(double)    // next 4 x double block
             ADD     R14, 4 * sizeof(double)
             ADD     R15, 4 * sizeof(double)
    end;
  end;

  writeln('lap inline asm simd:', gettickcount64 - t0);

  for j := 0 to high(c) do assert(c[j] = a[j] + b[j]);
  //writeln(format('%.0f %.0f %.0f', [a[j], b[j], c[j]]));

  { fpc native expression }
  fillChar(c[0], length(c) * sizeof(c[0]), 0);

  t0 := gettickcount64;
  for i := 0 to pred(n) do
    c[i] := a[i] + b[i];
  writeln('lap fpc native expression:', gettickcount64 - t0);

  for j := 0 to high(c) do assert(c[j] = a[j] + b[j]);
  //for j := high(a) to high(a) do writeln(format('%.0f %.0f %.0f', [a[j], b[j], c[j]]));
end;

begin
  testVector;
  //test16s;
  //test4s;
  //test8s;
  //test8d;
  //testAddLargeVect;

  writeln(#10: 3, 'end.');
  readln;
end.

Navigation

[0] Message Index

[#] Next page

Go to full version