So here is an example for the 32 byte alignment

"foo" has whatever alignment it happens to get from the surrounding code. In addition, its loop is offset by the code that precedes it inside the procedure.

It takes 4000 ms (on my PC: I7-8700)

Then the loop at exactly 32 byte aligned: 3640 ms (almost 10% faster)

The loop with an offset of 32+8 also is fast => so relevant code inside the loop must have just hit the right alignment.

The loop that is intentionally 32+16 takes 4000.

So (on a modern CPU), just adding the right alignment can make a noticeable difference.

And since functions are aligned at 16 bytes, the alignment of a loop depends on where the previous function ended. So the same code may sometimes be fast, and sometimes not.

This also means that if you benchmark and change code in one place, code in another place may be re-aligned and become faster or slower. Your total benchmark result may then change more because of the accidental alignment change than because of the change you were trying to measure.

See

https://lists.freepascal.org/pipermail/fpc-devel/2022-January/044336.html (includes a very interesting video presentation on the topic)

4000

4016

3640

3625

3610

3625

4015

4016

program Project1;

{$mode objfpc}{$H+}

uses

{$IFDEF UNIX}

cthreads,

{$ENDIF}

Classes, SysUtils

{ you can add units after this };

{$R *.res}

// Number of elements in each benchmark array (150 * 1024 * 1024).
const

N = 150*1024*1024;

// a and b are the input arrays, c receives the result. All three are dynamic
// byte arrays; the main program sets their length to N before use.
var

a, b, c: array of byte;

// Baseline benchmark kernel with no explicit alignment directive: the loop
// ends up wherever the surrounding code happens to place it.
// Writes c[0] as the average of a[0] and b[0]; every later c[i] is the
// average of a[i] and b[i] xor-ed with c[i-1].
// Reads the global arrays a and b, writes the global array c, for N elements.
procedure foo;

var

i: Integer;

begin

c[0] := (a[0] + b[0]) div 2;

for i := 1 to N-1 do begin

// The same assignment is written three times. Each repeat stores the same
// value again (its inputs do not change in between); presumably this is only
// here to make the loop body longer for the alignment measurement.
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

end;

end;

// Same computation as foo, but with a 32-byte alignment directive placed
// directly in front of the loop.
// Reads the global arrays a and b, writes the global array c, for N elements.
procedure foo2;

var

i: Integer;

begin

c[0] := (a[0] + b[0]) div 2;

// Assembler directive only, no instructions: asks the assembler to pad so that
// the code generated after this block (the for loop) starts on a 32-byte
// boundary.
asm

.align 32

end;

for i := 1 to N-1 do begin

// Identical statement repeated three times, exactly as in foo, so that the
// loop bodies of all variants are the same.
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

end;

end;

// Same computation as foo, but the loop is deliberately placed a few bytes
// past a 32-byte boundary: an alignment directive followed by eight nop
// instructions.
// Reads the global arrays a and b, writes the global array c, for N elements.
procedure foo3;

var

i: Integer;

begin

c[0] := (a[0] + b[0]) div 2;

// Align to 32 bytes, then emit eight nops so the code that follows starts at
// the "32+8" offset described in the accompanying text (assuming each nop
// assembles to a single byte, as it does on x86).
asm

.align 32

nop

nop

nop

nop

nop

nop

nop

nop

end;

for i := 1 to N-1 do begin

// Identical statement repeated three times, exactly as in foo, so that the
// loop bodies of all variants are the same.
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

end;

end;

// Same computation as foo, but the loop is deliberately placed past a 32-byte
// boundary: an alignment directive followed by sixteen nop instructions.
// Reads the global arrays a and b, writes the global array c, for N elements.
procedure foo4;

var

i: Integer;

begin

c[0] := (a[0] + b[0]) div 2;

// Align to 32 bytes, then emit sixteen nops so the code that follows starts at
// the "32+16" offset described in the accompanying text (assuming each nop
// assembles to a single byte, as it does on x86).
asm

.align 32

nop

nop

nop

nop

nop

nop

nop

nop

nop

nop

nop

nop

nop

nop

nop

nop

end;

for i := 1 to N-1 do begin

// Identical statement repeated three times, exactly as in foo, so that the
// loop bodies of all variants are the same.
c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

c[i] := ( (a[i] + b[i]) div 2) xor c[i-1];

end;

end;

type
  // Parameterless procedure, the shape shared by foo, foo2, foo3 and foo4.
  TBenchProc = procedure;

// Runs Bench once and prints the elapsed wall-clock time in milliseconds
// (as reported by GetTickCount64) on its own line.
procedure ReportElapsed(Bench: TBenchProc);
var
  started: QWord;
begin
  started := GetTickCount64;
  Bench();
  writeln(GetTickCount64 - started);
end;

var
  idx: Integer;

// Main program: allocate the three arrays, fill the two inputs with
// pseudo-random bytes, then time every variant twice in a row
// (foo, foo2, foo3, foo4) and wait for Enter before exiting.
begin
  SetLength(a, N);
  SetLength(b, N);
  SetLength(c, N);

  // Fill order matters for reproducibility: a[idx] is drawn before b[idx].
  for idx := 0 to N-1 do
  begin
    a[idx] := Random(255);
    b[idx] := Random(255);
  end;

  // Baseline: no explicit alignment.
  ReportElapsed(@foo);
  ReportElapsed(@foo);

  // Loop preceded by .align 32.
  ReportElapsed(@foo2);
  ReportElapsed(@foo2);

  // .align 32 followed by 8 nops.
  ReportElapsed(@foo3);
  ReportElapsed(@foo3);

  // .align 32 followed by 16 nops.
  ReportElapsed(@foo4);
  ReportElapsed(@foo4);

  // Keep the console window open until the user presses Enter.
  readln;
end.