### Author Topic: SSE ASM Function is less accurate than pascal code, why ? and or How to ?  (Read 803 times)

#### BeanzMaster

• New member
• Posts: 26
##### SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« on: September 13, 2017, 12:35:41 am »
Hi all, I've run some small tests with SSE assembler functions. They work, but they are slower than the native Pascal code.

This is the code:

Code: Pascal  [Select]
1. Const  // typed constants: the nc_VectorAdd disassembly further down loads each index from memory (movzbl) instead of folding it into the address
2.   _iX : byte = 0;  // index of the X component in TGLZVector.V
3.   _iY : byte = 1;  // index of the Y component in TGLZVector.V
4.   _iZ : byte = 2;  // index of the Z component in TGLZVector.V
5.   _iW : byte = 3;  // index of the W component in TGLZVector.V
6.
7. Type
8.   TGLZVector = Packed Record  // four Singles (16 bytes); the record declaration itself requests no 16-byte alignment
9.     Case Integer Of
10.       0: (V: Array[0..3] Of Single);  // indexed view of the four components
11.       1: (X, Y, Z, W: Single);  // named view overlaying the same four values
12.   End;
13.
14. function asm_sse_VectorAdd( V1, V2: TGLZVector):TGLZVector;assembler;  // SSE vector add returning the result; addresses the argument registers directly (RDX = V1, R8 = V2, RCX = result, per the author's comments)
15. asm
16.   movaps xmm0,[RDX] //[V1] aligned load; assumes V1 sits on a 16-byte boundary - TODO confirm, nothing in the listing guarantees it
17.   movups xmm1,[R8] //[V2] unaligned load, no alignment requirement
// NOTE(review): listing line 18 is absent from this copy. The disassembly below has a 3-byte gap
// between the second load (…6377, 4 bytes) and the store (…637E), so an instruction -
// presumably "addps xmm0,xmm1" - stood here in the original source.
19.   movups [ECX], xmm0  //[result] NOTE(review): ECX is a 32-bit register name; the disassembly shows it assembled as (%rcx), but RCX should be written explicitly in 64-bit code
20. End;
21.
22. procedure asm_sse_AddVector( V1, V2: TGLZVector; var V:TGLZVector); assembler;register;  // SSE vector add writing into the var parameter V; addresses the operands by parameter name instead of by register
23. asm
24.   movaps xmm0,[V1]  // aligned load; assumes V1 is 16-byte aligned - TODO confirm
25.   movaps xmm1,[V2]  // aligned load; assumes V2 is 16-byte aligned - TODO confirm
// NOTE(review): listing line 26 is absent from this copy; presumably the "addps xmm0,xmm1"
// that performs the actual addition stood here.
27.   movaps [V], XMM0  // aligned store into the caller's variable; assumes V is 16-byte aligned - TODO confirm
28. end;
29.
30. function nc_VectorAdd( AVector, AVector2: TGLZVector):TGLZVector;  // pure-Pascal reference: adds X, Y and Z component-wise and returns the result
31. begin
32.  result.v[_iX]:=AVector.v[_iX] + AVector2.v[_iX];
33.  result.v[_iY]:=AVector.v[_iY] + AVector2.v[_iY];
34.  result.v[_iZ]:=AVector.v[_iZ] + AVector2.v[_iZ];
35.  result.v[_iW] :=cZero;  // W is not summed but overwritten with cZero (declared outside this listing); NOTE(review): a 4-lane SSE add would also sum W, so the two variants may differ in W
36. end;
37.
38. procedure nc_AddVector( AVector, AVector2: TGLZVector;Var V:TGLZVector);  // pure-Pascal reference: adds X, Y and Z component-wise and stores the result in the var parameter V
39. begin
40.  V.v[_iX]:=AVector.v[_iX] + AVector2.v[_iX];
41.  V.v[_iY]:=AVector.v[_iY] + AVector2.v[_iY];
42.  V.v[_iZ]:=AVector.v[_iZ] + AVector2.v[_iZ];
43.  V.v[_iW] :=cZero;  // W is overwritten with cZero (declared outside this listing), not summed
44. end;

The results (a loop of 1000 additions)

The output in Asm view
Quote
asm
0000000100186370 55                          push   %rbp
0000000100186371 4889e5                   mov    %rsp,%rbp
..\..\..\source\GLZVectorMath.pas:244     movaps xmm0,[RDX]
0000000100186374 0f2802                   movaps (%rdx),%xmm0
..\..\..\source\GLZVectorMath.pas:245     movups xmm1,[R8]
0000000100186377 410f1008                 movups (%r8),%xmm1
..\..\..\source\GLZVectorMath.pas:247     movups [ECX], xmm0//[result],xmm0 //RCX
000000010018637E 0f1101                   movups %xmm0,(%rcx)
..\..\..\source\GLZVectorMath.pas:248     End;
0000000100186381 488d6500                 lea    0x0(%rbp),%rsp
0000000100186385 5d                       pop    %rbp
0000000100186386 c3                       retq

And this for the native code :

Quote
..\..\..\source\GLZVectorMath.pas:728     begin
00000001001866B0 488d6424c8               lea    -0x38(%rsp),%rsp
00000001001866B5 4889c8                   mov    %rcx,%rax
00000001001866B8 48891424                 mov    %rdx,(%rsp)
00000001001866BC 4c89442408               mov    %r8,0x8(%rsp)
00000001001866C1 488b0c24                 mov    (%rsp),%rcx
00000001001866C5 488b11                   mov    (%rcx),%rdx
00000001001866C8 4889542410               mov    %rdx,0x10(%rsp)
00000001001866CD 488b5108                 mov    0x8(%rcx),%rdx
00000001001866D1 4889542418               mov    %rdx,0x18(%rsp)
00000001001866D6 488b4c2408               mov    0x8(%rsp),%rcx
00000001001866DB 488b11                   mov    (%rcx),%rdx
00000001001866DE 4889542420               mov    %rdx,0x20(%rsp)
00000001001866E3 488b5108                 mov    0x8(%rcx),%rdx
00000001001866E7 4889542428               mov    %rdx,0x28(%rsp)
..\..\..\source\GLZVectorMath.pas:729     result.v[_iX]:=AVector.v[_iX] + AVector2.v[_iX];
00000001001866EC 0fb60dfd5d0200           movzbl 0x25dfd(%rip),%ecx        # 0x1001ac4f0 <TC_\$GLZMATH_\$\$_CZERO+16>
00000001001866F3 0fb615f65d0200           movzbl 0x25df6(%rip),%edx        # 0x1001ac4f0 <TC_\$GLZMATH_\$\$_CZERO+16>
00000001001866FA f30f10448c10             movss  0x10(%rsp,%rcx,4),%xmm0
0000000100186706 0fb615e35d0200           movzbl 0x25de3(%rip),%edx        # 0x1001ac4f0 <TC_\$GLZMATH_\$\$_CZERO+16>
000000010018670D f30f110490               movss  %xmm0,(%rax,%rdx,4)
..\..\..\source\GLZVectorMath.pas:730     result.v[_iY]:=AVector.v[_iY] + AVector2.v[_iY];
0000000100186712 0fb615e75d0200           movzbl 0x25de7(%rip),%edx        # 0x1001ac500 <TC_\$GLZMATH_\$\$_CZERO+32>
0000000100186719 0fb60de05d0200           movzbl 0x25de0(%rip),%ecx        # 0x1001ac500 <TC_\$GLZMATH_\$\$_CZERO+32>
0000000100186720 f30f10449410             movss  0x10(%rsp,%rdx,4),%xmm0
000000010018672C 0fb615cd5d0200           movzbl 0x25dcd(%rip),%edx        # 0x1001ac500 <TC_\$GLZMATH_\$\$_CZERO+32>
0000000100186733 f30f110490               movss  %xmm0,(%rax,%rdx,4)
..\..\..\source\GLZVectorMath.pas:731     result.v[_iZ]:=AVector.v[_iZ] + AVector2.v[_iZ];
0000000100186738 0fb615d15d0200           movzbl 0x25dd1(%rip),%edx        # 0x1001ac510 <TC_\$GLZMATH_\$\$_CZERO+48>
000000010018673F 0fb60dca5d0200           movzbl 0x25dca(%rip),%ecx        # 0x1001ac510 <TC_\$GLZMATH_\$\$_CZERO+48>
0000000100186746 f30f10449410             movss  0x10(%rsp,%rdx,4),%xmm0
0000000100186752 0fb615b75d0200           movzbl 0x25db7(%rip),%edx        # 0x1001ac510 <TC_\$GLZMATH_\$\$_CZERO+48>
0000000100186759 f30f110490               movss  %xmm0,(%rax,%rdx,4)
..\..\..\source\GLZVectorMath.pas:732     result.v[_iW] :=cZero;
000000010018675E 0fb615bb5d0200           movzbl 0x25dbb(%rip),%edx        # 0x1001ac520 <TC_\$GLZMATH_\$\$_CZERO+64>
0000000100186765 8b0d755d0200             mov    0x25d75(%rip),%ecx        # 0x1001ac4e0 <TC_\$GLZMATH_\$\$_CZERO>
000000010018676B 890c90                   mov    %ecx,(%rax,%rdx,4)
..\..\..\source\GLZVectorMath.pas:733     end;
000000010018676E 488d642438               lea    0x38(%rsp),%rsp
0000000100186773 c30000000000000000000000 retq

So my question is how to bypass the following instructions, which seem to be what decreases performance:

0000000100186370 55                          push   %rbp
0000000100186371 4889e5                   mov    %rsp,%rbp

and

0000000100186385 5d                       pop    %rbp

How can I optimize this?

If it is not possible, is it really worthwhile to use ASM / SSE with FPC?

PS : Same results in debug mode and with my release options :

{\$IFDEF RELEASE}
{\$RANGECHECKS OFF}
{\$FPUTYPE SSE3}
{\$INLINE ON}
{\$ALIGN 32}
{\$OPTIMIZATION LOOPUNROLL,LEVEL3,UNCERTAIN,PEEPHOLE,ASMCSE,ORDERFIELDS,FASTMATH,CSE,DFA}
{\$ENDIF}

Thanks in advance. And sorry for my english

#### Laksen

• Hero Member
• Posts: 594
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #1 on: September 13, 2017, 12:50:29 am »
All of these are pass by value, which means you create a copy of them on the callee stack before passing a pointer to the new record when calling the procedure. That's the slowest part here for sure..

Quote
movups [ECX], xmm0
Clearly you were using 64bit registers before. Using a pointer to ECX will make it crash on a 64bit machine some day..

Quote
movups/movaps
The difference between aligned and unaligned is very significant. If your structures are aligned you should always use movaps. But if your structures aren't aligned to a 16byte boundary then you should use movups. And are you sure your stack structures are aligned at a 16byte boundary?

When using "Packed Record" special packing and alignment rules get inferred. You should study the manual about those.

But I really doubt we support any kind of way of making a simple vector 16byte aligned on the local stack...

#### BeanzMaster

• New member
• Posts: 26
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #2 on: September 13, 2017, 07:21:59 pm »
All of these are pass by value, which means you create a copy of them on the callee stack before passing a pointer to the new record when calling the procedure. That's the slowest part here for sure..

Quote
movups [ECX], xmm0
Clearly you were using 64bit registers before. Using a pointer to ECX will make it crash on a 64bit machine some day..

Yes, this is clearly 64-bit code.
I didn't know that; I'll change it to RCX.
So how can I load the SSE registers directly, without a copy being made?

Quote
movups/movaps
The difference between aligned and unaligned is very significant. If your structures are aligned you should always use movaps. But if your structures aren't aligned to a 16byte boundary then you should use movups. And are you sure your stack structures are aligned at a 16byte boundary?

How or where can I check whether the stack is aligned or not? I tried to force it with {\$ALIGN 16}, but nothing changed.

As you can see in the code I posted, there are two versions: one using the registers directly and the other using the variable names. I've also tried adding NoStackFrame; and Register; at the end of the function declarations, but nothing changed.

When using "Packed Record" special packing and alignment rules get inferred. You should study the manual about those.

The size of TGLZVector is 128 bits (16 bytes), so shouldn't it be aligned on a 16-byte boundary?
I tried {\$PACKRECORDS 16}, but nothing changed.

Do you have a link to the part of the FPC or Lazarus manual where I can find this information?

But I really doubt we support any kind of way of making a simple vector 16byte aligned on the local stack...
I don't understand: why can't we create an aligned vector on the stack?

• Hero Member
• Posts: 4442
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #3 on: September 13, 2017, 09:00:09 pm »
https://www.freepascal.org/docs-html/ref/refsu15.html
https://www.freepascal.org/docs-html/current/prog/progsu9.html#x16-150001.2.9
Note {\$codealign} also handles code on the stack...(local)
Note you may be on your own if you insist on using assembler.... As I understand it it is your responsibility to align the stack.
But Pascal code - so also vectors - will be aligned to the codealign settings.

Anyway: if you use assembler I trust that you also know how to align your stack entries....Otherwise don't bother. Because it is rather basic...
« Last Edit: September 13, 2017, 09:21:03 pm by Thaddy »
"Logically, no number of positive outcomes at the level of experimental testing can confirm a scientific theory, but a single counterexample is logically decisive."

#### engkin

• Hero Member
• Posts: 1626
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #4 on: September 14, 2017, 03:33:41 am »
So my question how to "ByPass" this, it's seem what decrease performance

0000000100186370 55                          push   %rbp
0000000100186371 4889e5                   mov    %rsp,%rbp

and

0000000100186385 5d                       pop    %rbp

How to do to optimize this.

Try this code:
Code: Pascal  [Select]
2. asm
3.   movaps xmm0,[RDX] //[V1]
4.   movups xmm1,[R8] //[V2]
6.   movups [RCX], xmm0  //[result]
7. End;

#### Akira1364

• Sr. Member
• Posts: 300
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #5 on: September 14, 2017, 04:26:38 am »
Why are you assuming that your handwritten SSE-tuned ASM should be faster than what FPC produces natively from Pascal code? On top of that do you realize that if you implement a method purely in ASM, targeting a specific instruction set and nothing else, you're essentially crippling the performance of the application for anyone who has a CPU capable of using instructions more recent than the ones you used?

For example, my CPU supports up to AVX2. Why exactly do you suppose I would possibly want to run your SSE ASM version (SSE came out in 1999 on the Pentium 3, remember!) when I could simply set the appropriate FPC compiler flags for AVX2 along with O3 or O4 optimization, compile the Pascal implementation, and have a result that will be much, much, much, much, much faster than your version?

On a more general note, what is it with the Object Pascal development community and SSE/SSE2? Why do people continue to believe that that's the be-all-end-all of optimization? What year do you all think it is?!
« Last Edit: September 14, 2017, 04:17:53 pm by Akira1364 »

#### engkin

• Hero Member
• Posts: 1626
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #6 on: September 14, 2017, 05:41:57 am »
Why are you assuming that your handwritten SSE-tuned ASM should be faster than what FPC produces natively from Pascal code?
It seems to me that you are assuming that we are assuming. We are not.

On top of that do you realize that if you implement a method purely in ASM, targeting a specific instruction set and nothing else, you're essentially crippling the performance of the application for anyone who has a CPU capable of using instructions more recent than the ones you used?
Why are you assuming that our code would target one specific instruction set?

For example, my CPU supports up to AVX2. Why exactly do you suppose I would possibly want to run your SSE ASM version (SSE came out in 1999 on the Pentium 3, remember!) when I could simply set the appropriate FPC compiler flags for AVX2, compile the Pascal implementation, and have a result that will perform much, much, much, much, much faster than your version?
What if some CPU does not support AVX2? plus you had that argument against using assembly recently and here is Marco's answer. Maybe missed it, or forgot it.

On a more general note, what is it with the Object Pascal development community and SSE/SSE2? Why do people continue to believe that that's the be-all-end-all of optimization? What year do you all think it is?!
Again, why are you assuming that? who said SSE* is the be-all-end-all of optimization!!

#### Akira1364

• Sr. Member
• Posts: 300
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #7 on: September 14, 2017, 06:02:39 am »
BeanzMaster is Jerome Delauney, maintainer of the Lazarus port of GLScene, is he not? I'm assuming this post probably has something to do with the assembly versions of the methods in GLVectorGeometry.pas. Fantastic library, and I really appreciate the work he does on keeping it compatible with Lazarus, but I definitely always set GEOMETRY_NO_ASM in my GLScene.inc...

He seemed objectively surprised that his handwritten ASM was not faster than the compiled Pascal version, as though it were something that should be surprising in general. (It's not. Almost nobody is as clever as they think they are! Myself included.)

Literally any time you handwrite assembly code, you must be targeting a specific instruction set, or at least not going "above" a certain one.

You obviously didn't understand the gist of what I was saying, which is that you should just write everything in the best Pascal you're capable of and allow people to specify the CPU optimization flags that are specifically relevant to them when compiling. And yes, I read Marcos response. It didn't really mean anything groundbreaking. Plus I think he was ultimately agreeing with me that the compiler generally produces better ASM than ASM handwritten by people?

I'm not the one making any assumptions (other than the one in regards to this thread being about GLScene code). Pascal programmers being obsessed with SSE and SSE2 ASM implementations of methods is just "a thing", and has been for years. Maybe you're too young to be aware of that? (And I don't mean that in any kind of insulting way... just in the sense that you may have literally not been around long enough to recognize it as a consistent trend.)
« Last Edit: September 14, 2017, 04:19:55 pm by Akira1364 »

#### engkin

• Hero Member
• Posts: 1626
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #8 on: September 14, 2017, 06:42:23 am »
BeanzMaster is Jerome Delauney, maintainer of the Lazarus port of GLScene, is he not? I'm assuming this post probably has something to do with the assembly versions of the methods in GLVectorGeometry.pas. Fantastic library, and I really appreciate the work he does on keeping it compatible with Lazarus, but I definitely always set GEOMETRY_NO_ASM in my GLScene.inc...
As you can see he did not target one instruction set and gave you the option.
At least in this specific example he refuted one of your assumptions.

He seemed objectively surprised that his handwritten ASM was not faster than the compiled Pascal version, as though it were something that should be surprising in general. (It's not. Almost nobody is as clever as they think they are! Myself included.)
He also seemed to be surprised that FPC added some useless code that he believes is a possible reason to reduce the speed of his handwritten assembly code. That's why he started this thread, asking us how to get rid of that code. Laksen tried to explain that "The difference between aligned and unaligned is very significant.". Then the alignment on the stack became an obvious issue.

Literally any time you handwrite assembly code, you must be targeting a specific instruction set, or at least not going "above" a certain one.
Usually you check what the CPU supports and change the path of code based on that. Since SSE is pretty old, it makes perfect sense to start with as it covers a wide range of CPUs.

You obviously didn't understand the gist of what I was saying, which is that you should just write everything in the best Pascal you're capable of and allow people to specify the CPU optimization flags that are specifically relevant to them when compiling. And yes, I read Marcos response. It didn't really mean anything groundbreaking. Plus I think he was ultimately agreeing with me that the compiler generally produces better ASM than ASM handwritten by people?
Let me quote this part here: "one should always have both implementations and compare"
We have an impressive member here, BeRo, you might want to see if he uses any assembly in his code. I am sure if you poked around the RTL you'll notice some handwritten ASM, do you think it is time to replace it with pure pascal?

#### BeanzMaster

• New member
• Posts: 26
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #9 on: September 14, 2017, 10:41:43 am »
Hi to all
BeanzMaster is Jerome Delauney, maintainer of the Lazarus port of GLScene, is he not? I'm assuming this post probably has something to do with the assembly versions of the methods in GLVectorGeometry.pas. Fantastic library, and I really appreciate the work he does on keeping it compatible with Lazarus, but I definitely always set GEOMETRY_NO_ASM in my GLScene.inc...
You're right, Akira: I am running some tests with a view to replacing the "deprecated" asm functions in the VectorGeometry and math units.

https://www.freepascal.org/docs-html/ref/refsu15.html
https://www.freepascal.org/docs-html/current/prog/progsu9.html#x16-150001.2.9
Note {\$codealign} also handles code on the stack...(local)
Note you may be on your own if you insist on using assembler.... As I understand it it is your responsibility to align the stack.
But Pascal code - so also vectors - will be aligned to the codealign settings.

Anyway: if you use assembler I trust that you also know how to align your stack entries....Otherwise don't bother. Because it is rather basic...

Thanks, I tried {\$codealign} and the result is a little better. But sometimes (I can't say why, in 1.8rc4) the compiler returns "Error: Illegal alignment directive".
I also tried with an Array[0..3] of Single and with a pointer (PSingle); nothing changed. Anyway...

Try this code:
Code: Pascal  [Select]
2. asm
3.   movaps xmm0,[RDX] //[V1]
4.   movups xmm1,[R8] //[V2]
6.   movups [RCX], xmm0  //[result]
7. End;

Same results

After some reading, it appears that:
• The FPC compiler handles the stack differently than Delphi does.
• There still seems to be a problem in how the {\$align} and {\$packedrecord} directives interact (I am not sure about that).
• Using asm with SIMD instructions for small functions like the ones posted here is not recommended; the code generated by FPC is better (due to its stack management).
On the other hand, for batch data processing, writing dedicated SSE assembler code to improve performance is possible.

Conclusion: I will keep the code in 'pure Pascal'. Perhaps I'll look at CUDA/OpenCL later (I must do some research on this subject).
I will try an SSE approach for batch processing of data in some cases.

It would be nice to compare with Delphi, to see whether performance improves there or not.

A small question: if I compile with the AVX options and run the program on a PC that does not support AVX instructions, does the program fall back to SSE instructions, or does an error occur?

Thanks to all

« Last Edit: September 14, 2017, 10:54:41 am by BeanzMaster »

• Hero Member
• Posts: 4442
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #10 on: September 14, 2017, 12:30:00 pm »
No. If you explicitly set AVX it will generate AVX code, not SSE(X).
"Logically, no number of positive outcomes at the level of experimental testing can confirm a scientific theory, but a single counterexample is logically decisive."

#### marcov

• Global Moderator
• Hero Member
• Posts: 5646
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #11 on: September 14, 2017, 01:01:20 pm »
Quote
After a few readings. It appears that :
• The FPC compiler work differently than Delphi with the stack.
• It's seems to be stay a problem between the {\$align} and {\$packedrecord} directives (not sure about that)
• The use of asm with SIMD instructions for small functions like the one posted here. Not recommended. The generated FPC code is better (due to its stack management)
On the other hand, in the case of batch data processing. Writing of a specific asm SSE code for improving performance is possible.

• The first point is way too generic. It might be a minor detail or an accidentally exploited Delphi bug. Saying that FPC works differently is then a bit too broad.
• I also notice that you pass complex structures (records) to functions, and then assume that they are passed by reference. Use CONST, or better, constref to force that.
• If you don't want a stack frame, flag the function with the nostackframe directive.
• Trying to do SSE with relatively short pieces of code is always dangerous and difficult to tune for multiple compilers.
• For best performance, find the heaviest uses in loops, preferably with relatively short bodies, and convert those to assembler as a whole.

Quote
Conclusion, I will keep the code in 'pure pascal'.I think, perhaps, i'll take look later on the side of Cuda/OpenCL (i must do some research on this subject)
I will try an SSE approach during treatmentsof data per batch for some case

(CUDA is complicated and still emerging. However, the even bigger problem, at least for me, was that transferring data to and from the GPU already took time of the same order of magnitude as the calculations themselves, making it slower overall (image conversions).)

Quote
Small question if i compile with AVX options, if i run on a pc that does not have AVX instructions. Does the program switch to SSE instructions, or an error appear ?

No, it will crash. However you can add both sse and avx routines, run a detection, and then set a procvar to the desired routine (or have a DLL in both sse and avx(2) variant).

Note that AVX is mostly floating point (Ivy Bridge (3xxx), maybe Sandy Bridge (2xxx) too), and AVX2 adds the corresponding integer instructions (Haswell+ (4xxx or newer)).

#### BeanzMaster

• New member
• Posts: 26
##### Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #12 on: September 14, 2017, 01:18:23 pm »
No. If you explicitly set AVX it will generate AVX code, not SSE(X).
No, it will crash. However you can add both sse and avx routines, run a detection, and then set a procvar to the desired routine (or have a DLL in both sse and avx(2) variant).

Note avx is most floating point (Ivy bridge(3xxx), maybe sandy bridge(2xxx) too), and avx2 adds the corresponding integer instructions. (haswel+ (4xxx or newer)).

Ok Thanks

• I also notice that you pass complex structures (records) to functions, and then assume that they are by reference. Use CONST, or better constref to force that.
• If you don't want a stackframe, flag the function with nostackframe directive
• trying to do sse with relative short pieces of code is always dangerous and difficult to tune to multiple compilers
• for best performance, find the heaviest uses in loops with preferably relatively short bodies, and convert those to assembler as a whole
• Use CONST, or better constref to force that : no differences
• for best performance, find the heaviest uses in loops with preferably relatively short bodies, and convert those to assembler as a whole : It's seems the good way

(CUDA is complicated and still emerging. However the even bigger problem is that at least for me transfering data to and from the GPU was already in the magnitude of the calculations, making it slower over all (image conversions))

So I've converted this original Delphi code from Eric Grange: https://www.delphitools.info/2011/03/24/kudos-to-the-firefox-4-tracemonkey-team/ to test alignment (under Windows 10, with Lazarus 1.8rc4 64-bit):

Code: Pascal  [Select]
1. Unit Unit1;
2.
3. // Original Delphi code from Eric Grange : https://www.delphitools.info/2011/03/24/kudos-to-the-firefox-4-tracemonkey-team/
4. {\$mode objfpc}{\$H+}
5. {.\$codealign 16}  // ---> unit1.pas(4,2) Error: Illegal alignment directive
6. {\$Align 16}
7.
8. Interface
9.
10. Uses
11.   Classes, Sysutils, Windows, Fileutil, Forms, Controls, Graphics, Dialogs, ExtCtrls, StdCtrls,
12.   math;
13.
14. Type
15.
16.   { Tform1 }
17.
18.   Tform1 = Class(Tform)
19.     Button1 : Tbutton;
20.     Checkbox1 : Tcheckbox;
21.     Image1 : Timage;
22.     Label1 : Tlabel;
23.     Panel1 : Tpanel;
24.     Panel2 : Tpanel;
25.     Shape1 : Tshape;
26.     Procedure Button1click(Sender : Tobject);
27.     Procedure Checkbox1click(Sender : Tobject);
28.     Procedure Formcreate(Sender : Tobject);
29.     Procedure Image1mousedown(Sender : Tobject; Button : Tmousebutton; Shift : Tshiftstate; X, Y : Integer);
30.     Procedure Image1mousemove(Sender : Tobject; Shift : Tshiftstate; X, Y : Integer);
31.     Procedure Image1mouseup(Sender : Tobject; Button : Tmousebutton; Shift : Tshiftstate; X, Y : Integer);
32.   Private
33.
34.   Public
35.     FBitmap : TBitmap;
36.     FScanLines : array of PInteger;
37.
38.     iterLimit : Integer;
39.     qmin, qmax, pmin, pmax : Double;
40.     controlColors : array of TColor;
41.     colors : array of TColor;
42.     mbX, mbY : Integer;
43.
44.     procedure ResetMandel;
45.
46.     procedure ResetControlColors;
47.     procedure ComputeColors;
48.
49.     procedure ComputeMandel;
50.     procedure ComputeMandelDelphi;
51.     procedure ComputeMandelSSE;
52.
53.     procedure DrawPixel(x, y, c : Integer);
54.
55.   End;
56.
57. Var
58.   Form1 : Tform1;
59.
60. Implementation
61.
62. {\$R *.lfm}
63.
64. const
65.    MAX_COLORS = 512;
66.
67. Procedure Tform1.Formcreate(Sender : Tobject);
68. begin  // initialises the view range and the colour tables; does NOT create FBitmap or render (both are disabled below)
69. (*   FBitmap:=TBitmap.Create;
70.    FBitmap.PixelFormat:=pf32bit;
71.    FBitmap.SetSize(Image1.Width, Image1.Height);
72.      SetLength(FScanLines, FBitmap.Height);
73.    for i:=0 to FBitmap.Height-1 do
74.       FScanLines[i]:=PInteger(FBitmap.RawImage.GetLineStart(i)); *)
75.    ResetMandel;  // default coordinate window and threshold
76.    ResetControlColors;  // default control colours
77.    ComputeColors;  // implementation not visible in this excerpt
78.    //Label1.Caption:='Double Size : '+InttoStr(SizeOf(Double)); //give 8 so 8x8bits = 64bits
79.  //  ComputeMandel;
80. End;
81.
82. Procedure Tform1.Button1click(Sender : Tobject);
83. Begin  // restore the default coordinate window, then render it
84.    ResetMandel;
85.    ComputeMandel;
86. End;
87.
88. Procedure Tform1.Checkbox1click(Sender : Tobject);
89. Begin  // re-render the current window; ComputeMandel reads CheckBox1.Checked to choose the SSE or the Pascal routine
90.    ComputeMandel;
91. End;
92.
93. Procedure Tform1.Image1mousedown(Sender : Tobject; Button : Tmousebutton; Shift : Tshiftstate; X, Y : Integer);
94. Begin  // remember where the drag started; mbX/mbY are read by the mouse-move and mouse-up handlers
95.    mbX:=X;
96.    mbY:=Y;
97. End;
98.
99. Procedure Tform1.Image1mousemove(Sender : Tobject; Shift : Tshiftstate; X, Y : Integer);
100. var
101.    s : Integer;  // side length of the square selection
102. begin  // while the left button is held, show Shape1 as a square selection anchored at the drag origin
103.    if ssLeft in Shift then begin
104.       s := Max(X-mbX, Y-mbY);  // larger of the two drag extents, so the selection stays square
105.       if s>0 then begin  // nothing is shown when the cursor is up/left of the origin
106.          Shape1.SetBounds(mbX+Image1.Left, mbY+Image1.Top, s, s);  // image-relative origin offset by Image1's position
107.          Shape1.Visible:=True;
108.       end;
109.    end;
110. End;
111.
112. Procedure Tform1.Image1mouseup(Sender : Tobject; Button : Tmousebutton; Shift : Tshiftstate; X, Y : Integer);
113. var
114.    s : Integer;  // side length of the square selection, in pixels
115.    pw, qw : Double;  // current width of the p range and height of the q range
116. begin  // finishes a drag: zooms the coordinate window to the selected square and re-renders
117.    Shape1.Visible:=False;  // hide the selection shape
118.    if (not Assigned(FBitmap)) or (FBitmap.Width = 0) or (FBitmap.Height = 0) then Exit;  // FBitmap is only created by ComputeMandel; without this guard a drag before the first render dereferences nil (or divides by zero)
119.    s:=Max(X-mbX, Y-mbY);
120.    if (s>3) then  // ignore clicks and tiny drags
121.    begin
122.       X := mbX + s;  // bottom-right corner of the square selection
123.       Y := mbY + s;
124.       pw := pmax - pmin;
125.       pmin := pmin + mbX * pw / FBitmap.Width;  // move the left edge in by the fraction left of the selection
126.       pmax := pmax - (FBitmap.Width - X) * pw / FBitmap.Width;  // move the right edge in by the fraction right of it
127.       qw := qmax - qmin;
128.       qmin := qmin + (FBitmap.Height - Y) * qw / FBitmap.Height;  // q axis is flipped: qmax maps to row 0, so the bottom margin raises qmin
129.       qmax := qmax - mbY * qw / FBitmap.Height;  // and the top margin lowers qmax
130.
131.       ComputeMandel;  // render the new window
132.    end;
133. End;
134.
135. procedure TForm1.ResetMandel;
136. begin  // restores the default coordinate window and threshold
137.    iterLimit := 100;  // despite the name, ComputeMandelDelphi compares this against r = x*x + y*y (escape threshold); the iteration cap is the local kmax
138.    qmin := -1.5;  // q range: vertical axis (qmax maps to the top row in ComputeMandelDelphi)
139.    qmax := 1.5;
140.    pmin := -2.25;  // p range: horizontal axis
141.    pmax := 0.75;
142. end;
143.
144. // ResetControlColors
145. // Resets controlColors to the five built-in RGB entries (presumably the control points ComputeColors works from - TODO confirm, ComputeColors is not visible here)
146. procedure TForm1.ResetControlColors;
147. begin
148.    SetLength(controlColors, 5);
149.
150.    controlColors[0] := RGBToColor(\$20, \$00, \$00);
151.    controlColors[1] := RGBToColor(\$ff, \$ff, \$ff);
152.    controlColors[2] := RGBToColor(\$A0, \$00, \$00);
153.    controlColors[3] := RGBToColor(\$FF, \$ff, \$40);
154.    controlColors[4] := RGBToColor(\$FF, \$20, \$20);
155. end;
156.
157. // ComputeMandel
158. // Recreates FBitmap at Image1's size, renders into it with the routine selected by CheckBox1, shows it in Image1 and reports the elapsed time in Label1
159. procedure TForm1.ComputeMandel;
160. var
161.    start, stop, freq : Int64;  // performance-counter readings and counter frequency
162.    i:Integer;
163. begin
164.    // We must reset FBitmap to take change in Image1
165.    if Assigned(FBitmap) then FreeAndNil(FBitmap);  // NOTE(review): the Assigned check is redundant, FreeAndNil accepts nil
166.    FBitmap:=TBitmap.Create;
167.    FBitmap.PixelFormat:=pf32bit;
168.    FBitmap.SetSize(Image1.Width, Image1.Height);
169.      SetLength(FScanLines, FBitmap.Height);
170.    for i:=0 to FBitmap.Height-1 do
171.       FScanLines[i]:=PInteger(FBitmap.RawImage.GetLineStart(i));  // cache the start address of every row
172.
173.    Start:=0; Stop:=0; freq:=1;  // defaults in case the counter calls leave the variables untouched (freq=1 avoids a division by zero below)
174.    QueryPerformanceCounter(start);
175.    FBitmap.BeginUpdate();  // NOTE(review): BeginUpdate/EndUpdate are inside the timed region and not protected by try/finally
176.    if CheckBox1.Checked then ComputeMandelSSE
177.    else ComputeMandelDelphi;
178.    FBitmap.EndUpdate();
179.    QueryPerformanceCounter(stop);
180.    QueryPerformanceFrequency(freq);
181.
182.    Image1.Picture.Bitmap := FBitmap;
183.
184.    Label1.Caption:='Generate in '+Format('%.1f milliseconds', [(stop-start)/freq*1000]);  // ticks / (ticks per second) * 1000
185. end;
186.
187. // ComputeMandelDelphi
188. // Pure-Pascal renderer: for every pixel iterates (x,y) -> (x*x - y*y + p, 2*x*y + q) and passes the iteration count to DrawPixel
189. procedure TForm1.ComputeMandelDelphi;
190. const
191.    kmax = 256;  // iteration cap per pixel
192. var
193.    xstep, ystep : Double;  // size of one pixel in p and q units
194.    x, y, r : Double;
195.    sx, sy, k : Integer;  // pixel column, pixel row, iteration counter
196.    p, q, x0, y0 : Double;
197. begin
198.    xstep := (pmax - pmin) / FBitmap.Width;
199.    ystep := (qmax - qmin) / FBitmap.Height;
200.
201.    for sx := 0 to FBitmap.Width-1 do begin  // columns in the outer loop, rows in the inner loop
202.       for sy := 0 to FBitmap.Height-1 do begin
203.
204.          p := pmin + xstep * sx;  // depends only on sx but is recomputed for every row
205.          q := qmax - ystep * sy;  // row 0 maps to qmax (vertical axis flipped)
206.          k := 0;
207.          x0 := 0;
208.          y0 := 0;
209.
210.          repeat
211.             x := x0 * x0 - y0 * y0 + p;
212.             y := 2 * x0 * y0 + q;
213.             x0 := x;
214.             y0 := y;
215.             r := x * x + y * y;  // squared magnitude of the new point
216.             Inc(k);
217.          until ((r > iterLimit) or (k >= kmax));  // stop once r exceeds iterLimit or the iteration cap is reached
218.
219.          if k >= kmax then
220.             k := 0;  // points that hit the cap are reported with count 0
221.
222.          DrawPixel(sx, sy, k);
223.       end;
224.    end;
225. end;
226.
227. // From https://github.com/UltraStar-Deluxe/USDX/blob/master/src/base/UCommon.pas
228. type
229.   // stores the unaligned pointer of data allocated by GetAlignedMem()
232.
233. (**
234.  * Use this function to assure that allocated memory is aligned on a specific
235.  * byte boundary.
236.  * Alignment must be a power of 2.
237.  *
238.  * Important: Memory allocated with GetAlignedMem() MUST be freed with
239.  * FreeAlignedMem(), FreeMem() will cause a segmentation fault.
240.  *
241.  * Hint: If you do not need dynamic memory, consider to allocate memory
242.  * statically and use the {\$ALIGN x} compiler directive. Note that delphi
243.  * supports an alignment "x" of up to 8 bytes only whereas FPC supports
244.  * alignments on 16 and 32 byte boundaries too.
245.  *)
246. {\$WARNINGS OFF}
247. function GetAlignedMem(Size: cardinal; Alignment: integer): pointer; // returns nil when GetMem yields nil
248. var
249.   OrigPtr: pointer; // the raw, unaligned block returned by GetMem
250. const
251.   MIN_ALIGNMENT = 16; // smaller requested alignments are raised to this value
252. begin
253.   // Delphi and FPC (tested with 2.2.0) align memory blocks allocated with
254.   // GetMem() at least on 8 byte boundaries. Delphi uses a minimal alignment
255.   // of either 8 or 16 bytes depending on the size of the requested block
256.   // (see System.GetMinimumBlockAlignment). As we do not want to change the
257.   // boundary for the worse, we align at least on MIN_ALIGNMENT.
258.   if (Alignment < MIN_ALIGNMENT) then
259.     Alignment := MIN_ALIGNMENT;
260.
261.   // allocate unaligned memory: header + payload + Alignment spare bytes for the round-up below
262.   GetMem(OrigPtr, SizeOf(TMemAlignHeader) + Size + Alignment);
263.   if (OrigPtr = nil) then
264.   begin
265.     Result := nil;
266.     Exit;
267.   end;
268.
269.   // reserve space for the header
270.   Result := pointer(PtrUInt(OrigPtr) + SizeOf(TMemAlignHeader));
271.   // align memory: advance to the next multiple of Alignment (a full Alignment step if already aligned)
272.   Result := pointer(PtrUInt(Result) + Alignment - PtrUInt(Result) mod Alignment);
273.   // NOTE(review): listing lines 274-275 are missing from this excerpt; as shown, OrigPtr is never recorded. Presumably the missing lines store it in the header just below Result - confirm against the original UCommon.pas.
276. end;
277. {\$WARNINGS ON}
278.
279. {\$WARNINGS OFF}
280. procedure FreeAlignedMem(P: pointer); // counterpart of GetAlignedMem; a nil P is ignored
281. begin
282.   if (P <> nil) then // NOTE(review): listing line 283 is missing from this excerpt, so this guard has no statement and nothing is released as shown; presumably the missing line frees the original block - confirm against the original UCommon.pas
284. end;
285. {\$WARNINGS ON}
286.
287.
288. // ComputeMandelSSE: same per-pixel loop as ComputeMandelDelphi, with the iteration body written as inline SSE2 scalar asm.
289. // x0 is kept in xmm0 and y0 in xmm1 across iterations; r is stored back to memory so Pascal code can test it.
290. procedure TForm1.ComputeMandelSSE;
291. const
292.    kmax = 256; // maximum number of iterations per pixel
293.   //c2 : Double = 2.0; // Using this typed const raised a SIGSEGV at: movsd xmm4, c2 / mulsd xmm1,c2
294. var
295.    xstep, ystep : Double;
296.    r : Double;
297.    sx, sy, k : Integer;
298.    p, q, x0, y0 : Double;
299.    c2 : Double;
300.    _p, _q, _x0, _y0, _c2,_r : PDouble; // only assigned in the commented-out aligned-memory experiment below
301.    AlignedDoubleSize : Cardinal;
302. begin
303.    c2 := 2.0; // the factor 2 of "2 * x0 * y0", held in a local variable instead of the typed const above
304.   // AlignedDoubleSize := 2*Sizeof(Double); //128bits
305.   // _p := PDouble(GetAlignedMem(AlignedDoubleSize,16));
306.  //  _q := PDouble(GetAlignedMem(AlignedDoubleSize,16));
307.  //  _x0 := PDouble(GetAlignedMem(AlignedDoubleSize,16));
308.  //  _y0 := PDouble(GetAlignedMem(AlignedDoubleSize,16));
309.   //  _r := PDouble(GetAlignedMem(AlignedDoubleSize,16));
310.   // _c2 := PDouble(GetAlignedMem(AlignedDoubleSize,16));
311.  //  _c2^:= c2; inc(_c2); _x0^:=c2;
312.
313.    xstep := (pmax - pmin) / FBitmap.Width;
314.    ystep := (qmax - qmin) / FBitmap.Height;
315.
316.    for sx := 0 to FBitmap.Width-1 do
317.    begin
318.       for sy := 0 to FBitmap.Height-1 do
319.       begin
320.          p := pmin + xstep * sx;
321.          q := qmax - ystep * sy;
322.          k := 0;
323.          x0 := 0;
324.          y0 := 0;
325.       //   _p^:= p; inc(_p); _p^:=p;
326.       //   _q^:= p; inc(_q); _q^:=p;
327.       //   _x0^:= p; inc(_x0); _x0^:=p;
328.       //   _y0^:= p; inc(_y0); _y0^:=p;
329.          asm
330.             movsd xmm0, _x0; // NOTE(review): _x0 is the PDouble local, which is never assigned in the visible code; x0 (the Double set to 0 above) looks like the intended operand - confirm
331.             movsd xmm1, _y0; // NOTE(review): same concern as the previous line - _y0 is an unassigned PDouble, y0 looks intended
332.           //  movsd xmm4, c2
333.          end;
334.
335.          repeat
336.             asm
337.                // x := x0 * x0 - y0 * y0 + p;
338.                movsd xmm2, xmm0
339.                mulsd xmm2, xmm2
340.                movsd xmm3, xmm1
341.                mulsd xmm3, xmm3
342.                subsd xmm2, xmm3
343.                movsd xmm4, p // NOTE(review): listing line 344 is missing from this excerpt; presumably it adds xmm4 (p) to xmm2 - confirm
345.                // y := 2 * x0 * y0 + q;
346.                // y0 :=y
347.                mulsd xmm1, xmm0
348.                movsd xmm4, c2
349.                mulsd xmm1, xmm4
350.                movsd xmm4, q // NOTE(review): listing line 351 is missing from this excerpt; presumably it adds xmm4 (q) to xmm1 - confirm
352.                // x0 := x
353.                movsd xmm0, xmm2
354.                // r := x * x + y * y;
355.                mulsd xmm2, xmm2
356.                movsd xmm3, xmm1
357.                mulsd xmm3, xmm1 // NOTE(review): listing line 358 is missing from this excerpt; presumably it adds xmm3 to xmm2 before r is stored - confirm
359.                movsd r, xmm2
360.             end;
361.             Inc(k); // NOTE(review): the loop relies on xmm0/xmm1 keeping their values across this statement and the until test; nothing visible here guarantees the compiler leaves them untouched - check the generated code
362.          until ((r > iterLimit) or (k >= kmax));
363.
364.          if k >= kmax then k := 0; // the budget ran out: use index 0 instead of the iteration count
365.
366.          DrawPixel(sx, sy, k);
367.       end;
368.    end;
369. //  FreeAlignedMem(_p);
370. //  FreeAlignedMem(_q);
371. //  FreeAlignedMem(_x0);
372. //  FreeAlignedMem(_y0);
373.  // FreeAlignedMem(_r);
374.  // FreeAlignedMem(_c2);
375. end;
376.
377. // ComputeColors: sizes the colors array to MAX_COLORS entries and fills it from controlColors.
378. // Entry 0 is RGB(0, 0, 0); each controlColors[i] -> controlColors[i + 1] pair yields 64 linearly interpolated entries.
379.
380.
381. procedure TForm1.ComputeColors;
382. var
383.    i, k : Integer;
384.    rstep, bstep, gstep : Double;
385. begin
386.    SetLength(colors, MAX_COLORS);
387.
388.    colors[0] := RGB(0, 0, 0);
389.
390.    for i:=0 to High(controlColors) do begin // NOTE(review): at i = High(controlColors) the body reads controlColors[i + 1], one past the last index - check against the declaration of controlColors (not shown here)
391.       rstep := (GetRValue(controlColors[i + 1]) - GetRValue(controlColors[i])) / 63; // per-step increment, so k = 63 lands exactly on the next control colour
392.       gstep := (GetGValue(controlColors[i + 1]) - GetGValue(controlColors[i])) / 63;
393.       bstep := (GetBValue(controlColors[i + 1]) - GetBValue(controlColors[i])) / 63;
394.
395.       for k:=0 to 63 do
396.          colors[k + (i * 64) + 1] := RGB(Round(GetRValue(controlColors[i]) + rstep * k), // segment i occupies entries i*64+1 .. i*64+64
397.                                          Round(GetGValue(controlColors[i]) + gstep * k),
398.                                          Round(GetBValue(controlColors[i]) + bstep * k));
399.    end;
400.
401.     for i := 257 to MAX_COLORS-1 do // entries from 257 upwards copy the entry 256 positions earlier
402.        colors[i] := colors[i - 256];
403. end;
404.
405. // DrawPixel: writes colors[c] into row y of FScanLines at offset x.
406. // No bounds checks are performed on x, y or c.
407. procedure TForm1.DrawPixel(x, y, c : Integer); inline;
408. begin
409.    PInteger(FScanLines[y]+(x))^:= colors[c]; // assumes FScanLines[y] is a typed PInteger so that +x steps whole Integer-sized pixels - TODO confirm against the field declaration
410. end;
411.
412.
413. End.
414.
415.

There are compiler warnings (the same ones appear when using GetAlignedMem), and the {$align} directive does not seem to change anything.

Quote
Compilation du projet - Cible : project1.exe : Succès - Avertissements : 6
unit1.pas(325,25) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(326,25) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(338,28) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(343,28) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(346,28) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(356,25) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"

And two screenshots: the first without SSE, the second with SSE.

As we can see, the SSE version produces a wrong result.

Do you have any clues that would help me understand what I am missing regarding the "data alignment" scheme?

Best regards