So here is an example for the 32 byte alignment
"foo" has whatever alignment it gets by surrounding code. Also, its loop is offset by the code in front of it.
It takes 4000 ms (on my PC: I7-8700)
Then the loop aligned at exactly 32 bytes: 3640 ms (almost 10% faster)
The loop with an offset of 32+8 also is fast => so relevant code inside the loop must have just hit the right alignment.
The loop that is intentionally 32+16 takes 4000.
So (on a modern CPU), just adding the right alignment can make a noticeable difference.
And since functions are aligned at 16 bytes, the alignment of a loop depends on where the previous function ended. So the same code may sometimes be fast, and sometimes not.
Which also means, if you benchmark, and you change code in one place, then code in another place may be re-aligned, and be faster or slower. Your total benchmark then may change more by the accidental align change, than by the change you tried to measure.
See
https://lists.freepascal.org/pipermail/fpc-devel/2022-January/044336.html
It includes a very interesting video presentation on the topic.
4000
4016
3640
3625
3610
3625
4015
4016
program Project1;
{$mode objfpc}{$H+}
uses
{$IFDEF UNIX}
cthreads,
{$ENDIF}
Classes, SysUtils
{ you can add units after this };
{$R *.res}
const
N = 150*1024*1024;
var
a, b, c: array of byte;
{ Baseline benchmark routine: fills the global array c from the global
  arrays a and b.
  c[0] is (a[0] + b[0]) div 2; every later c[i] is (a[i] + b[i]) div 2
  xor-ed with c[i-1].
  No alignment directive is used here, so the loop ends up wherever the
  surrounding code happens to place it. }
procedure foo;
var
i: Integer;
begin
c[0] := (a[0] + b[0]) div 2;
for i := 1 to N-1 do begin
// The same assignment is written three times; the repeats store the same
// value again (c[i-1] does not change in between).
// NOTE(review): presumably this only enlarges the loop body for the
// benchmark - confirm with the author.
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
end;
end;
{ Same computation as foo (c[0] is (a[0] + b[0]) div 2, each later c[i] is
  (a[i] + b[i]) div 2 xor-ed with c[i-1]), but with an inline-assembler
  ".align 32" directive placed directly in front of the loop statement. }
procedure foo2;
var
i: Integer;
begin
c[0] := (a[0] + b[0]) div 2;
// Request 32-byte alignment for the code that follows this asm block.
// NOTE(review): whatever the compiler emits for the "for" statement before
// the loop body (e.g. loop setup) also comes after this point - check the
// generated assembly to see where the loop body itself starts.
asm
.align 32
end;
for i := 1 to N-1 do begin
// The same assignment is written three times; the repeats store the same
// value again (c[i-1] does not change in between).
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
end;
end;
{ Same computation as foo (c[0] is (a[0] + b[0]) div 2, each later c[i] is
  (a[i] + b[i]) div 2 xor-ed with c[i-1]), but the loop statement is
  preceded by ".align 32" followed by 8 nop instructions, i.e. the
  "32+8" offset variant of the benchmark. }
procedure foo3;
var
i: Integer;
begin
c[0] := (a[0] + b[0]) div 2;
// Align to 32 bytes, then insert 8 nop instructions so that the code that
// follows starts past the 32-byte boundary.
// NOTE(review): the resulting offset is 8 bytes only if each nop is encoded
// as a single byte - confirm for the target assembler/CPU.
asm
.align 32
nop
nop
nop
nop
nop
nop
nop
nop
end;
for i := 1 to N-1 do begin
// The same assignment is written three times; the repeats store the same
// value again (c[i-1] does not change in between).
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
end;
end;
{ Same computation as foo (c[0] is (a[0] + b[0]) div 2, each later c[i] is
  (a[i] + b[i]) div 2 xor-ed with c[i-1]), but the loop statement is
  preceded by ".align 32" followed by 16 nop instructions, i.e. the
  "32+16" offset variant of the benchmark. }
procedure foo4;
var
i: Integer;
begin
c[0] := (a[0] + b[0]) div 2;
// Align to 32 bytes, then insert 16 nop instructions so that the code that
// follows starts past the 32-byte boundary.
// NOTE(review): the resulting offset is 16 bytes only if each nop is
// encoded as a single byte - confirm for the target assembler/CPU.
asm
.align 32
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
end;
for i := 1 to N-1 do begin
// The same assignment is written three times; the repeats store the same
// value again (c[i-1] does not change in between).
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];
end;
end;
{ Calls Routine once and prints the elapsed GetTickCount64 ticks
  (milliseconds) on a line of its own. }
procedure RunTimed(Routine: TProcedure);
var
  StartTick: QWord;
begin
  StartTick := GetTickCount64;
  Routine();
  writeln(GetTickCount64 - StartTick);
end;

var
  idx, pass: Integer;
begin
  { Allocate the three N-byte work arrays. }
  SetLength(a, N);
  SetLength(b, N);
  SetLength(c, N);

  { Fill the inputs; a[idx] is drawn before b[idx] so the sequence of
    Random calls stays the same. }
  for idx := 0 to N-1 do begin
    a[idx] := Random(255);
    b[idx] := Random(255);
  end;

  { Time every variant twice, back to back: foo, foo2, foo3, foo4. }
  for pass := 1 to 2 do
    RunTimed(@foo);
  for pass := 1 to 2 do
    RunTimed(@foo2);
  for pass := 1 to 2 do
    RunTimed(@foo3);
  for pass := 1 to 2 do
    RunTimed(@foo4);

  { Keep the console window open until Enter is pressed. }
  readln;
end.