Try it on Win64 (I mostly use 64-bit targets):
{$mode delphi}
type
// Variant record holding four singles that can be accessed either as an
// indexable array (asvec[0..3]) or as the named components x, y, z, w.
// Both variants overlay the same storage.
Single4 = record
case boolean of
true :(asvec: array[0..3] of single);
false:(x,y,z,w : single);
end;
procedure tt;
// wrapper procedure to not use global variables.
// For every index of A, adds B[i].asvec to A[i].asvec using the whole-array
// "+=" operator.
// NOTE(review): A and B are never initialised and the result is never read
// afterwards; presumably only the generated machine code matters here.
var
A,B: array[0..10] of Single4;
i : integer;
begin
for i:=low(a) to high(a) do
// Array-wide addition on the 4-single view; this is the statement whose
// generated code is under inspection.
// NOTE(review): presumably relies on the vector-processing switch (-Sv) - confirm.
a[i].asvec+=b[i].asvec;
end;
begin
// Program entry point: only invokes the test procedure.
tt;
end.
Compile with -Sv -O4 -Opcoreavx -O
Then the 64-bit code looks like:
.Lj5:
addl $1,%edx
# [16] a[i].asvec+=b[i].asvec;
movl %edx,%eax
shlq $4,%rax
movl %edx,%ecx
shlq $4,%rcx
movdqa (%rsp,%rax),%xmm0
addps 176(%rsp,%rcx),%xmm0
movl %edx,%eax
shlq $4,%rax
cmpl $10,%edx
jnge .Lj5
# [17] end;
64-bit architectures usually have 16-byte alignment built in. The loop looks horrible — the loop index is scaled by 16 (shlq $4) three times per iteration — and shows that FPC lacks strength reduction.