Lazarus

Announcements => Third party => Topic started by: zamronypj on July 14, 2019, 01:30:02 pm

Title: Collection of operator overloading for vector operations with SIMD
Post by: zamronypj on July 14, 2019, 01:30:02 pm
Collection of operator overloading for vector operation using Intel SIMD SSE instructions written in Free Pascal

https://github.com/zamronypj/oprsimd
Title: Re: Collection of operator overloading for vector operations with SIMD
Post by: Thaddy on July 14, 2019, 02:35:18 pm
What I am missing is a pure pascal version and the ability to use the compiler support for vector operations.
In the long run that might be a better option. (Not now, but in the future)

Good job, nevertheless!

p.s.: I am referring to the available -Sv option in combination with -CfXXX That combination is processor independent.
Title: Re: Collection of operator overloading for vector operations with SIMD
Post by: BeanzMaster on July 14, 2019, 05:47:25 pm
Hi, if you are interested in a vector library using SIMD, you can also take a look at my library here: https://github.com/jdelauney/SIMD-VectorMath-UnitTest

Best regards
Title: Re: Collection of operator overloading for vector operations with SIMD
Post by: Thaddy on July 14, 2019, 06:13:49 pm
Still, both of you need to test against the compiler .... I gave a hint on how to use them...
Title: Re: Collection of operator overloading for vector operations with SIMD
Post by: rforcen on April 04, 2024, 10:01:35 am
I've done some performance testing: only if you embed the SIMD asm code inline (no function or operator calls) can you achieve a 2.5x performance improvement using SIMD vs. native code, so this is a feasible option only in very specific situations,

when using function calls or operators there is only a 20-35% performance gain,

attached code is a bit messy just for testing purposes.

Code: Pascal  [Select][+][-]
  1. {
  2.  testing SIMD intrinsic emulation
  3.  
  4.  conclusion: not worth using direct SIMD instructions as only a 25% of improvement is achieved
  5. }
  6.  
  7. {$mode delphi}
  8.  
  9. program testSIMD;
  10.  
  11. uses
  12.   SysUtils;
  13.  
  14. type
  15.   //  {$ALIGN 16}
  16.  
  17.   vf32 = array of single;
  18.   T4s = array[0..4 - 1] of single;
  19.   T8s = array[0..8 - 1] of single; // 8 x 4 = 32 bytes x 8 = 256 bits
  20.   T16s = array[0..16 - 1] of single;
  21.   T4d = array [0..4 - 1] of double; // 4*64 = 256 bits = Y reg
  22.  
  { Four single-precision components stored back to back, so the whole
    record (16 bytes) can be moved with a single 128-bit SSE load/store. }
  TVector = record
    x: single;
    y: single;
    z: single;
    w: single;
  public
    // Component-wise sum of v1 and v2.
    class operator +(const v1, v2: TVector): TVector; inline;
  end;
  28.  
  29.   {$ASMMODE intel}
  { Adds two 4-element single arrays with one packed SSE addition.

    a, b   : operands (4 singles = 128 bits each).
    result : element-wise sum a[i] + b[i].

    Unaligned moves (MOVUPS) are used on purpose: T4s carries no 16-byte
    alignment guarantee (the ALIGN 16 directive above is commented out) and
    MOVAPS raises a general-protection fault on a misaligned address. }
  function _add4(const a: T4s; const b: T4s): T4s; assembler; nostackframe;
  asm
           MOVUPS  XMM0, [a]
           MOVUPS  XMM2, [b]
           ADDPS   XMM0, XMM2
           MOVUPS  [result],XMM0
  end;
  37.  
  { Adds two 8-element single arrays as two independent 128-bit halves.
    All four loads are issued first, then both packed additions, then both
    stores; the element-wise result is a[i] + b[i] for i = 0..7. }
  function _add8(const a: T8s; const b: T8s): T8s; assembler; nostackframe;
  asm
           // elements 0..3
           MOVUPS  XMM0, [a]
           MOVUPS  XMM1, [b]
           // elements 4..7 (16 bytes further on)
           MOVUPS  XMM2, [a+16]
           MOVUPS  XMM3, [b+16]

           ADDPS   XMM0, XMM1
           ADDPS   XMM2, XMM3

           MOVUPS  [result], XMM0
           MOVUPS  [result+16], XMM2
  end;
  50.  
  { Plain Pascal reference: element-wise sum of two 4-element single arrays
    (written out per element instead of looping). }
  function add4(const a: T4s; const b: T4s): T4s; inline;
  begin
    Result[0] := a[0] + b[0];
    Result[1] := a[1] + b[1];
    Result[2] := a[2] + b[2];
    Result[3] := a[3] + b[3];
  end;
  57.  
  { Plain Pascal reference: element-wise sum of two 8-element single arrays. }
  function add8(const a: T8s; const b: T8s): T8s; inline;
  var
    k: integer;
  begin
    // Bounds are taken from the array type (0..7) rather than hard-coded.
    for k := low(a) to high(a) do
    begin
      Result[k] := a[k] + b[k];
    end;
  end;
  64.  
  { Component-wise sum of two TVector values using one packed SSE addition.

    v2 is first loaded with MOVUPS instead of being used directly as the
    memory operand of ADDPS: the legacy-SSE ADDPS requires a 16-byte aligned
    memory operand and faults otherwise, and TVector (four singles) is not
    guaranteed to be 16-byte aligned. }
  class operator TVector.+(const v1: TVector; const v2: TVector): TVector;
  assembler; nostackframe;
  asm
           MOVUPS  XMM0, [v1]
           MOVUPS  XMM1, [v2]
           ADDPS   XMM0, XMM1
           MOVUPS  [result], XMM0
  end;
  72. const
  73.   n = 200000000;
  74.  
  75.  
  { Times TVector addition three ways, n + 1 iterations each, and prints the
    elapsed milliseconds of every variant:
      1. the overloaded + operator (SIMD inside an assembler routine),
      2. four scalar Pascal additions,
      3. an asm block embedded directly in the loop.
    The original author reports roughly a 2.5x gain for variant 3. }
  procedure testVector;
  var
    a, b, c: TVector;
    i: integer;
    t0: int64;
  begin
    a.x := 1;
    a.y := 1;
    a.z := 1;
    a.w := 1;
    b.x := 2;
    b.y := 2;
    b.z := 2;
    b.w := 2;

    writeln('test vector');
    t0 := getTickCount64;
    for i := 0 to n do
      c := a + b;
    writeln('lap call func w/simd:', getTickCount64 - t0);

    t0 := getTickCount64;
    for i := 0 to n do
    begin
      c.x := a.x + b.x;
      c.y := a.y + b.y;
      c.z := a.z + b.z;
      c.w := a.w + b.w;
    end;
    writeln('lap fpc native:', getTickCount64 - t0);


    t0 := getTickCount64;
    for i := 0 to n do
      asm
               // b is loaded with MOVUPS first: ADDPS with a memory operand
               // needs 16-byte alignment, which a stack TVector does not have.
               MOVUPS  XMM0, [a]
               MOVUPS  XMM1, [b]
               ADDPS   XMM0, XMM1
               MOVUPS  [c], XMM0
      end;
    writeln('lap embedded simd:', getTickCount64 - t0);

  end;
  118.  
  { Benchmarks 4-element single addition three ways (n + 1 repetitions each):
    the assembler routine _add4, an inline Pascal element loop, and the
    Pascal function add4. Prints each lap in milliseconds and finally the
    contents of the last result. }
  procedure test4s;
  var
    a, b, c: T4s;
    elem: single;
    rep, started: int64;
    outer, k: integer;
  begin
    // a = (1, 2, 3, 4), b = (5, 6, 7, 8)
    for k := 0 to 3 do
    begin
      a[k] := k + 1;
      b[k] := k + 5;
    end;

    writeln('test4s');

    started := getTickCount64;
    for rep := 0 to n do
    begin
      c := _add4(a, b);
    end;
    writeln('lap simd:', getTickCount64 - started);

    started := getTickCount64;
    for outer := 0 to n do
    begin
      for k := 0 to 3 do
        c[k] := a[k] + b[k];
    end;
    writeln('lap fpc inline:', getTickCount64 - started);

    started := getTickCount64;
    for outer := 0 to n do
    begin
      c := add4(a, b);
    end;
    writeln('lap fpc func:', getTickCount64 - started);

    // Dump the result of the last variant (no trailing newline, as before).
    for elem in c do
      Write(format('%f ', [elem]));
  end;
  155.  
  { Benchmarks 8-element single addition four ways (n + 1 repetitions each):
      1. the assembler routine _add8,
      2. inline asm with two 128-bit XMM additions,
      3. inline asm with one 256-bit YMM addition (needs an AVX-capable CPU),
      4. a plain Pascal element loop.
    Prints each lap in milliseconds; variants 2 and 3 also print the result.

    Changes against the original listing:
      - MOVAPS replaced by MOVUPS: the local arrays are not guaranteed to be
        16-byte aligned and MOVAPS faults on a misaligned address.
      - VMOVDQU (integer domain) replaced by VMOVUPS (float domain) for
        floating-point data.
      - VZEROUPPER is issued after the AVX loop so that the following
        SSE-coded native loop is not slowed down by dirty upper YMM halves,
        which would distort the comparison.
      - unused locals ix0/ix1 removed. }
  procedure test8s;
  var
    a, b, c: T8s;
    s: single;
    i, j, t0: int64;
  begin
    for i := 0 to 7 do
    begin
      a[i] := i;
      b[i] := i;
    end;
    writeln('test8s');

    // simd asm func   -> slow due to call

    t0 := getTickCount64;
    for i := 0 to n do c := _add8(a, b);
    writeln('lap simd asm func:', getTickCount64 - t0);

    fillchar(c, sizeof(c), 0);

    // simd w/asm embedded
    t0 := getTickCount64;
    for i := 0 to n do
    begin
      asm // inline add 8s using 128 bit xmm regs.
               MOVUPS  XMM0, [a]
               MOVUPS  XMM1, [b]
               ADDPS   XMM0, XMM1
               MOVUPS  [c],XMM0

               MOVUPS  XMM0, [a+4*4] // offset is 4 single x 4 bytes
               MOVUPS  XMM1, [b+4*4]
               ADDPS   XMM0, XMM1
               MOVUPS  [c+4*4],XMM0
      end;
    end;
    writeln('lap code inserted asm simd using X 128 regs:', getTickCount64 - t0);
    for s in c do Write(format('%f ', [s]));
    writeln;

    // 256 bit reg = 8 x 32fp
    fillchar(c, sizeof(c), 0);

    t0 := getTickCount64;
    for i := 0 to n do
    begin
      asm // inline add 8s using 256 bit 'Y' regs
               VMOVUPS  YMM0,[a]
               VMOVUPS  YMM1,[b]
               VADDPS   YMM0, YMM0, YMM1
               VMOVUPS  [c],YMM0
      end;
    end;
    // Clear the upper YMM halves before any SSE-coded Pascal code runs.
    asm
             VZEROUPPER
    end;
    writeln('lap code inserted asm simd using Y 256 bit regs:', getTickCount64 - t0);
    for s in c do Write(format('%f ', [s]));
    writeln;


    t0 := getTickCount64;
    for i := 0 to n do for j := 0 to 7 do c[j] := a[j] + b[j];
    writeln('lap fpc native code :', getTickCount64 - t0);
  end;
  221.  
  { Benchmarks 16-element single addition (n + 1 repetitions each) with
    inline AVX (two 256-bit additions) against a plain Pascal element loop,
    and prints both laps plus the native/SIMD time ratio.

    Changes against the original listing:
      - the native loop now adds all 16 elements (it stopped at index 7, so
        it did half the work of the SIMD variant and the ratio was skewed);
      - the ratio is only computed when the SIMD lap is non-zero, avoiding a
        floating-point division by zero;
      - VMOVDQU (integer domain) replaced by VMOVUPS for float data;
      - VZEROUPPER is issued after the AVX loop so the SSE-coded native loop
        is not penalised by dirty upper YMM halves. }
  procedure test16s;
  var
    a16, b16, c16: T16s; // 16 x 32 = 512 bits
    s: single;
    i, t0, ts, tn: int64;
    ix0, ix1: integer;
  begin
    writeln('test 16 single array packed');

    for i := 0 to pred(16) do
    begin
      a16[i] := i;
      b16[i] := i;
      c16[i] := 0;
    end;

    t0 := getTickCount64;
    for i := 0 to n do
    begin
      asm // inline add 16s

               VMOVUPS  YMM0,[a16]
               VMOVUPS  YMM1,[b16]
               VADDPS   YMM0, YMM0, YMM1
               VMOVUPS  [c16],YMM0

               VMOVUPS  YMM0,[a16+8*4]
               VMOVUPS  YMM1,[b16+8*4]
               VADDPS   YMM0, YMM0, YMM1
               VMOVUPS  [c16+8*4],YMM0
      end;
    end;
    // Clear the upper YMM halves before any SSE-coded Pascal code runs.
    asm
             VZEROUPPER
    end;
    ts := getTickCount64 - t0;
    writeln('lap T16s, code inserted asm simd using two Y 256 bit regs:', ts);
    for s in c16 do Write(format('%f ', [s]));
    writeln;

    t0 := getTickCount64;
    // Same amount of work as the SIMD variant: all 16 elements.
    for ix0 := 0 to n do
      for ix1 := low(c16) to high(c16) do
        c16[ix1] := a16[ix1] + b16[ix1];
    tn := getTickCount64 - t0;
    if ts > 0 then
      writeln('lap fpc sum expression using T16s:', tn, ' ratio:', 1.0 * tn / ts)
    else
      writeln('lap fpc sum expression using T16s:', tn, ' ratio: n/a (SIMD lap below timer resolution)');
  end;
  264.  
  {$ASSERTIONS ON} // the result checks below are compiled out otherwise

  { Benchmarks addition of two 4-element double arrays (one 256-bit YMM
    register; despite the name, 4 doubles are processed) with inline AVX via
    pointers against a plain Pascal element loop, n + 1 repetitions each.
    Prints both laps in milliseconds and the final result.

    Changes against the original listing:
      - RAX is used as scratch register instead of R15: R15 is callee-saved
        in both the Win64 and System V ABIs and was overwritten without
        being preserved;
      - assertions are switched on so the result verification really runs;
      - VZEROUPPER is issued after the AVX loop so the SSE-coded native loop
        is not penalised by dirty upper YMM halves. }
  procedure test8d;
  var
    a, b, c: T4d;
    pa, pb, pc: ^T4d;
    d: double;
    i, j: integer;
    t0: int64;
  begin
    writeln('test 4 x double');

    for i := 0 to 4 - 1 do
    begin
      a[i] := i;
      b[i] := i;
      c[i] := 0;
    end;


    pa := @a;
    pb := @b;
    pc := @c;

    // add 2 arrays of 4 doubles each
    t0 := gettickcount64;
    for i := 0 to n do
    begin
      asm
               MOV      RAX,[pa]
               VMOVUPD  YMM0,[RAX]        // ymm0=[pa]
               MOV      RAX,[pb]
               VADDPD   YMM0, YMM0, [RAX] // ymm0+=[pb]
               MOV      RAX,[pc]
               VMOVUPD  [RAX],YMM0        // [pc]=ymm0
      end;
    end;
    // Clear the upper YMM halves before any SSE-coded Pascal code runs.
    asm
             VZEROUPPER
    end;
    writeln('lap inline asm simd:', gettickcount64 - t0);
    for i := 0 to 4 - 1 do assert(c[i] = a[i] + b[i]);
    //Write(format('%f ', [pc[i]]));
    //writeln;


    t0 := gettickcount64;

    for i := 0 to n do
      for j := 0 to high(a) do
        c[j] := a[j] + b[j];

    writeln('lap fpc native expression:', gettickcount64 - t0);
    for d in c do Write(format('%f ', [d]));
    writeln;
  end;
  316.  
  {$ASSERTIONS ON} // the result checks below are compiled out otherwise

  { Benchmarks c := a + b over three dynamic arrays of n doubles each, first
    with inline AVX (4 doubles per step), then with a plain Pascal loop, and
    prints both laps in milliseconds. The original author reports the SIMD
    variant as about 25% faster.

    NOTE: the three arrays together need 3 * n * 8 bytes of memory.

    Changes against the original listing:
      - the running addresses are kept in Pascal pointer variables and loaded
        into volatile registers (RAX/RCX/RDX) inside each asm block. The
        original parked them in R13..R15, which are callee-saved registers
        that were never preserved, and relied on them surviving between
        separate asm blocks;
      - the last n mod 4 elements, which do not fill a whole 256-bit block,
        are now added with scalar code instead of being skipped;
      - assertions are switched on so the result verification really runs;
      - VZEROUPPER is issued after the AVX loop. }
  procedure testAddLargeVect;
  var
    a, b, c: array of double;
    pa, pb, pc: PDouble;
    i, j: integer;
    t0: int64;
  begin
    Write('test 4 x double large vector add, filling data...');

    { create vectors }
    a := nil;
    b := nil;
    c := nil;

    setLength(a, n);
    setLength(b, n);
    setLength(c, n);

    { fill }
    for i := 0 to pred(n) do
    begin
      a[i] := i;
      b[i] := i;
      c[i] := 0;
    end;
    writeln('done');

    { simd inline asm code }
    t0 := gettickcount64;

    pa := @a[0];
    pb := @b[0];
    pc := @c[0];

    for i := 0 to pred(n div 4) do // blocks of (256 / 64) 4 doubles
    begin
      asm // use 256 bit ymm regs.
               MOV      RAX,[pa]
               MOV      RCX,[pb]
               MOV      RDX,[pc]
               VMOVUPD  YMM0,[RAX]        // ymm0=[pa]
               VADDPD   YMM0, YMM0, [RCX] // ymm0+=[pb]
               VMOVUPD  [RDX],YMM0        // [pc]=ymm0
      end;
      // next 4 x double block (typed pointers advance by whole elements)
      inc(pa, 4);
      inc(pb, 4);
      inc(pc, 4);
    end;
    // Clear the upper YMM halves before any SSE-coded Pascal code runs.
    asm
             VZEROUPPER
    end;

    // Remaining elements when n is not a multiple of 4.
    for i := (n div 4) * 4 to pred(n) do
      c[i] := a[i] + b[i];

    writeln('lap inline asm simd:', gettickcount64 - t0);

    for j := 0 to high(c) do assert(c[j] = a[j] + b[j]);
    //writeln(format('%.0f %.0f %.0f', [a[j], b[j], c[j]]));

    { fpc native expression }
    fillChar(c[0], length(c) * sizeof(c[0]), 0);

    t0 := gettickcount64;
    for i := 0 to pred(n) do
      c[i] := a[i] + b[i];
    writeln('lap fpc native expression:', gettickcount64 - t0);

    for j := 0 to high(c) do assert(c[j] = a[j] + b[j]);
    //for j := high(a) to high(a) do writeln(format('%.0f %.0f %.0f', [a[j], b[j], c[j]]));

  end;
  382.  
  { Program entry point: runs the selected benchmark(s), then waits for
    Enter so the console window stays open. Uncomment further calls to run
    the other benchmarks. The unused global loop variable was removed. }
  begin
    testVector;
    //test16s;
    //test4s;
    //test8s;
    //test8d;
    //testAddLargeVect;

    // #10: 3 writes a line feed right-aligned in a field of width 3.
    writeln(#10: 3, 'end.');
    readln;
  end.
  396.  
Title: Re: Collection of operator overloading for vector operations with SIMD
Post by: rforcen on April 04, 2024, 10:07:11 am
Collection of operator overloading for vector operation using Intel SIMD SSE instructions written in Free Pascal

https://github.com/zamronypj/oprsimd
hi,

on operators/func. parameter passing, regs (r8,rdx,rcx) are used but you address xmm#,

is this correct on fpc 3.2.2?

it doesn't work for me
TinyPortal © 2005-2018