{
  testing SIMD intrinsic emulation
  conclusion: not worth using direct SIMD instructions, as only about a 25% improvement is achieved
}
{$mode delphi}
program testSIMD;
uses
SysUtils;
type
// {$ALIGN 16}
vf32 = array of single; // dynamic array of singles (unused below; kept for reference)
T4s = array[0..4 - 1] of single; // 4 x 32-bit = 128 bits = one XMM register
T8s = array[0..8 - 1] of single; // 8 singles x 4 bytes = 32 bytes = 256 bits
T16s = array[0..16 - 1] of single; // 16 x 32-bit = 512 bits (two YMM registers)
T4d = array [0..4 - 1] of double; // 4 x 64-bit = 256 bits = one YMM register
// 4-component single-precision vector with a SIMD-backed '+' operator
TVector = record
x, y, z, w: single;
public
class operator +(const v1: TVector; const v2: TVector): TVector; inline;
end;
{$ASMMODE intel}
{ Add two 4-single vectors with one SSE packed add.
  Fix: MOVUPS instead of MOVAPS — const T4s parameters are passed by
  reference and FPC only guarantees 4-byte alignment for arrays of single,
  so the aligned MOVAPS form could raise #GP on a misaligned operand
  (_add8 below already uses the unaligned form).
  With nostackframe, [a]/[b]/[result] resolve to the platform ABI's
  parameter/result-pointer registers. }
function _add4(const a: T4s; const b: T4s): T4s; assembler; nostackframe;
asm
  MOVUPS  XMM0, [a]         // xmm0 = a[0..3]
  MOVUPS  XMM2, [b]         // xmm2 = b[0..3]
  ADDPS   XMM0, XMM2        // xmm0 += b (4 packed singles)
  MOVUPS  [result], XMM0    // result[0..3] = xmm0
end;
{ Add two 8-single vectors using two 128-bit SSE packed adds.
  MOVUPS (unaligned) is used because const T8s parameters are passed by
  reference and FPC only guarantees 4-byte alignment for arrays of single.
  NOTE(review): with nostackframe, [a]/[b]/[result] resolve to the ABI's
  parameter/result registers — confirm for the target platform. }
function _add8(const a: T8s; const b: T8s): T8s; assembler; nostackframe;
asm
MOVUPS XMM0, [a] // result[0..3] = a[0..3] + b[0..3]
MOVUPS XMM2, [b]
ADDPS XMM0, XMM2
MOVUPS [result],XMM0
MOVUPS XMM0, [a+16] // result[4..7] = a[4..7] + b[4..7] (second 16-byte half)
MOVUPS XMM2, [b+16]
ADDPS XMM0, XMM2
MOVUPS [result+16],XMM0
end;
{ Element-wise sum of two T4s vectors — plain Pascal reference version. }
function add4(const a: T4s; const b: T4s): T4s; inline;
var
  k: integer;
begin
  for k := low(Result) to high(Result) do
    Result[k] := a[k] + b[k];
end;
{ Element-wise sum of two T8s vectors — plain Pascal reference version. }
function add8(const a: T8s; const b: T8s): T8s; inline;
var
  k: integer;
begin
  for k := low(Result) to high(Result) do
    Result[k] := a[k] + b[k];
end;
{ Packed-single addition of two TVector records (x,y,z,w) with one SSE add.
  Fix: load v2 through MOVUPS into a register first — the legacy-SSE
  ADDPS with a MEMORY operand requires 16-byte alignment, and TVector
  (four singles) is only guaranteed 4-byte alignment, so the original
  `ADDPS XMM0, [v2]` could raise #GP on a misaligned operand. }
class operator TVector.+(const v1: TVector; const v2: TVector): TVector;
assembler; nostackframe;
asm
  MOVUPS  XMM0, [v1]        // xmm0 = v1.(x,y,z,w)
  MOVUPS  XMM1, [v2]        // unaligned-safe load of v2
  ADDPS   XMM0, XMM1        // packed single add, register form (no alignment req.)
  MOVUPS  [result], XMM0    // store sum into result record
end;
const
n = 200000000;
{ Benchmark TVector addition three ways: the SIMD class-operator call,
  plain FPC per-field code, and asm embedded directly in the loop.
  Fix: the embedded asm loaded b with a memory-operand ADDPS, which in
  legacy SSE encoding requires 16-byte alignment; local records are only
  4-byte aligned, so b is now loaded via MOVUPS first. }
procedure testVector; // ~2.5x performance when embedding asm code
var
  a, b, c: TVector;
  i: integer;
  t0: int64;
begin
  a.x := 1; a.y := 1; a.z := 1; a.w := 1;
  b.x := 2; b.y := 2; b.z := 2; b.w := 2;
  writeln('test vector');
  t0 := getTickCount64;
  for i := 0 to n do
    c := a + b;               // SIMD class operator (includes call overhead)
  writeln('lap call func w/simd:', getTickCount64 - t0);
  t0 := getTickCount64;
  for i := 0 to n do
  begin
    c.x := a.x + b.x;
    c.y := a.y + b.y;
    c.z := a.z + b.z;
    c.w := a.w + b.w;
  end;
  writeln('lap fpc native:', getTickCount64 - t0);
  t0 := getTickCount64;
  for i := 0 to n do
    asm
      MOVUPS XMM0, [a]        // xmm0 = a
      MOVUPS XMM1, [b]        // unaligned-safe load (see header note)
      ADDPS XMM0, XMM1        // register form: no 16-byte alignment required
      MOVUPS [c], XMM0        // c = a + b
    end;
  writeln('lap embedded simd:', getTickCount64 - t0);
end;
{ Benchmark adding two 4-single arrays: asm SIMD function call vs. inline
  FPC loop vs. inlined Pascal helper. Prints timings and the final result.
  Fix: a terminating writeln is added after the result dump so the next
  test's header starts on its own line (consistent with test8s/test16s). }
procedure test4s;
var
  a, b, c: T4s;
  s: single;
  i, t0: int64;
  ix0, ix1: integer;
begin
  a[0] := 1; a[1] := 2; a[2] := 3; a[3] := 4;
  b[0] := 5; b[1] := 6; b[2] := 7; b[3] := 8;
  writeln('test4s');
  t0 := getTickCount64;
  for i := 0 to n do
    c := _add4(a, b);         // SIMD asm function (per-call overhead)
  writeln('lap simd:', getTickCount64 - t0);
  t0 := getTickCount64;
  for ix0 := 0 to n do
    for ix1 := 0 to 3 do
      c[ix1] := a[ix1] + b[ix1];
  writeln('lap fpc inline:', getTickCount64 - t0);
  t0 := getTickCount64;
  for ix0 := 0 to n do
    c := add4(a, b);          // inlined Pascal helper
  writeln('lap fpc func:', getTickCount64 - t0);
  for s in c do Write(format('%f ', [s]));
  writeln;                    // terminate the result line (consistency fix)
end;
{ Benchmark adding two 8-single arrays: asm function call, embedded
  128-bit SSE asm, embedded 256-bit AVX asm, and plain FPC code.
  Fixes:
  - MOVUPS replaces MOVAPS in the embedded SSE block: stack locals of
    single are only guaranteed 4-byte alignment, so the aligned form
    could raise #GP.
  - VZEROUPPER is issued after the AVX loop to avoid AVX->SSE transition
    penalties in the surrounding non-AVX code.
  - unused locals ix0, ix1 removed. }
procedure test8s;
var
  a, b, c: T8s;
  s: single;
  i, j, t0: int64;
begin
  for i := 0 to 7 do
  begin
    a[i] := i;
    b[i] := i;
  end;
  writeln('test8s');
  // simd asm func -> slow due to call
  t0 := getTickCount64;
  for i := 0 to n do c := _add8(a, b);
  writeln('lap simd asm func:', getTickCount64 - t0);
  fillchar(c, sizeof(c), 0);
  // simd w/asm embedded
  t0 := getTickCount64;
  for i := 0 to n do
  begin
    asm // inline add 8s using 128 bit xmm regs (unaligned loads, see header)
      MOVUPS XMM0, [a]
      MOVUPS XMM1, [b]
      ADDPS XMM0, XMM1
      MOVUPS [c],XMM0
      MOVUPS XMM0, [a+4*4]    // second half: offset is 4 singles x 4 bytes
      MOVUPS XMM1, [b+4*4]
      ADDPS XMM0, XMM1
      MOVUPS [c+4*4],XMM0
    end;
  end;
  writeln('lap code inserted asm simd using X 128 regs:', getTickCount64 - t0);
  for s in c do Write(format('%f ', [s]));
  writeln;
  // using VMOVDQU YMM0,[a] // 256 bit reg = 8 x 32-bit singles
  fillchar(c, sizeof(c), 0);
  t0 := getTickCount64;
  for i := 0 to n do
  begin
    asm // inline add 8s using 256 bit 'Y' regs
      VMOVDQU YMM0,[a]
      VMOVDQU YMM1,[b]
      VADDPS YMM0, YMM0, YMM1
      VMOVDQU [c],YMM0
    end;
  end;
  asm
    VZEROUPPER                // clear upper YMM state before SSE/FPC code
  end;
  writeln('lap code inserted asm simd using Y 256 bit regs:', getTickCount64 - t0);
  for s in c do Write(format('%f ', [s]));
  writeln;
  t0 := getTickCount64;
  for i := 0 to n do for j := 0 to 7 do c[j] := a[j] + b[j];
  writeln('lap fpc native code :', getTickCount64 - t0);
end;
{ Benchmark adding two 16-single arrays: two 256-bit AVX adds per pass
  vs. a plain FPC loop; prints the timing ratio.
  Bug fix: the scalar reference loop previously covered only elements
  0..7 — half the array — so its timing (and the printed ratio) was
  unfairly low; it now processes all 16 elements like the SIMD path.
  Also: VZEROUPPER issued after the AVX loop to avoid AVX->SSE
  transition penalties. }
procedure test16s;
var
  a16, b16, c16: T16s;        // 16 x 4 bytes = 64 bytes = 512 bits
  s: single;
  i, t0, ts, tn: int64;
  ix0, ix1: integer;
begin
  writeln('test 16 single array packed');
  for i := 0 to pred(16) do
  begin
    a16[i] := i;
    b16[i] := i;
    c16[i] := 0;
  end;
  t0 := getTickCount64;
  for i := 0 to n do
  begin
    asm // inline add 16s: two 8-single halves with 256-bit YMM regs
      VMOVDQU YMM0,[a16]
      VMOVDQU YMM1,[b16]
      VADDPS YMM0, YMM0, YMM1
      VMOVDQU [c16],YMM0
      VMOVDQU YMM0,[a16+8*4]  // second half: offset 8 singles x 4 bytes
      VMOVDQU YMM1,[b16+8*4]
      VADDPS YMM0, YMM0, YMM1
      VMOVDQU [c16+8*4],YMM0
    end;
  end;
  asm
    VZEROUPPER                // clear upper YMM state before SSE/FPC code
  end;
  ts := getTickCount64 - t0;
  writeln('lap T16s, code inserted asm simd using two Y 256 bit regs:', ts);
  for s in c16 do Write(format('%f ', [s]));
  writeln;
  t0 := getTickCount64;
  for ix0 := 0 to n do
    for ix1 := 0 to pred(16) do  // all 16 elements (was 0..7: bug)
      c16[ix1] := a16[ix1] + b16[ix1];
  tn := getTickCount64 - t0;
  writeln('lap fpc sum expression using T16s:', tn, ' ratio:', 1.0 * tn / ts);
end;
{ Benchmark: add two 4-double arrays (4 x 64 = 256 bits, one YMM register)
  with inline AVX asm vs. a plain FPC loop; verifies the SIMD result.
  NOTE(review): despite the name, this routine handles 4 doubles (T4d).
  NOTE(review): the asm block clobbers R15 — a callee-saved register —
  without preserving it; FPC may keep live data there. Confirm, or
  save/restore R15 around the block.
  NOTE(review): no VZEROUPPER after the AVX code; mixing with subsequent
  SSE code may incur AVX->SSE transition penalties. }
procedure test8d;
var
a, b, c: T4d;
pa, pb, pc: ^T4d;
d: double;
i, j: integer;
t0: int64;
begin
writeln('test 4 x double');
for i := 0 to 4 - 1 do
begin
a[i] := i;
b[i] := i;
c[i] := 0;
end;
pa := @a;
pb := @b;
pc := @c;
// add 2 arrays of 4 doubles each
t0 := gettickcount64;
for i := 0 to n do
begin
asm
MOV R15,[pa] // r15 = pa (pointer to a); WARNING: R15 is callee-saved
VMOVUPD YMM0,[R15] // ymm0 = a[0..3]
MOV R15,[pb]
VADDPD YMM0, YMM0, [R15] // ymm0 += b[0..3]
MOV R15,[pc]
VMOVUPD [R15],YMM0 // c[0..3] = ymm0
end;
end;
writeln('lap inline asm simd:', gettickcount64 - t0);
for i := 0 to 4 - 1 do assert(c[i] = a[i] + b[i]); // verify SIMD result
//Write(format('%f ', [pc[i]]));
//writeln;
t0 := gettickcount64;
for i := 0 to n do
for j := 0 to high(a) do
c[j] := a[j] + b[j];
writeln('lap fpc native expression:', gettickcount64 - t0);
for d in c do Write(format('%f ', [d]));
writeln;
end;
{ Benchmark: element-wise add of two n-element double vectors, AVX inline
  asm (4 doubles per 256-bit YMM op) vs. a plain FPC expression; both
  results are verified with asserts.
  NOTE(review): r13..r15 are loaded in one asm block and assumed to keep
  their values across every iteration of the surrounding Pascal for-loop;
  the compiler gives no such guarantee, and all three are callee-saved
  registers clobbered without being preserved — fragile, verify on the
  target compiler/ABI before reuse.
  NOTE(review): the asm loop processes n div 4 blocks, so it assumes
  n mod 4 = 0 (true for n = 200000000); otherwise a scalar tail loop
  would be needed.
  NOTE(review): no VZEROUPPER after the AVX loop (transition penalties). }
procedure testAddLargeVect; { simd inline is 25% faster than fpc native expression }
var
a, b, c: array of double;
i, j: integer;
t0: int64;
begin
Write('test 4 x double large vector add, filling data...');
{ create vectors }
a := nil;
b := nil;
c := nil;
setLength(a, n);
setLength(b, n);
setLength(c, n);
{ fill }
for i := 0 to pred(n) do
begin
a[i] := i;
b[i] := i;
c[i] := 0;
end;
writeln('done');
{ simd inline asm code }
t0 := gettickcount64;
asm // preload regs, r13..r15 = pa..pc — see NOTE(review) in header
MOV R13,[a[0]] // r13=@a[0], r14=@b[0], r15=@c[0] (dyn arrays hold data pointers)
MOV R14,[b[0]]
MOV R15,[c[0]]
end;
for i := 0 to pred(n div 4) do // blocks of (256 / 64) 4 doubles
begin
asm // use 256 bit ymm regs.
VMOVUPD YMM0,[R13] // ymm0 = a-block
VADDPD YMM0, YMM0, [R14] // ymm0 += b-block
VMOVUPD [R15],YMM0 // c-block = ymm0
ADD R13, 4 * sizeof(double) // advance to next 4 x double block
ADD R14, 4 * sizeof(double)
ADD R15, 4 * sizeof(double)
end;
end;
writeln('lap inline asm simd:', gettickcount64 - t0);
for j := 0 to high(c) do assert(c[j] = a[j] + b[j]); // verify SIMD result
//writeln(format('%.0f %.0f %.0f', [a[j], b[j], c[j]]));
{ fpc native expression }
fillChar(c[0], length(c) * sizeof(c[0]), 0);
t0 := gettickcount64;
for i := 0 to pred(n) do
c[i] := a[i] + b[i];
writeln('lap fpc native expression:', gettickcount64 - t0);
for j := 0 to high(c) do assert(c[j] = a[j] + b[j]); // verify native result
//for j := high(a) to high(a) do writeln(format('%.0f %.0f %.0f', [a[j], b[j], c[j]]));
end;
{ Program entry point: run the active benchmark; the other tests are left
  disabled. Fix: removed the unused module-level variable i. }
begin
  testVector;
  //test16s;
  //test4s;
  //test8s;
  //test8d;
  //testAddLargeVect;
  writeln(#10: 3, 'end.');
  readln;                     // keep the console window open
end.