* * *

Author Topic: SSE ASM Function is less accurate than pascal code, why ? and or How to ?  (Read 1387 times)

BeanzMaster

  • Jr. Member
  • **
  • Posts: 69
Hi all, I've run a few small tests with SSE assembler functions. They work, but they are slower than the native Pascal code.

This is the code:

Code: Pascal  [Select]
  1. Const
  2.   _iX : byte = 0;
  3.   _iY : byte = 1;
  4.   _iZ : byte = 2;
  5.   _iW : byte = 3;
  6.  
  7. Type
  8.   TGLZVector = Packed Record
  9.     Case Integer Of
  10.       0: (V: Array[0..3] Of Single);
  11.       1: (X, Y, Z, W: Single);
  12.   End;
  13.  
  14. function asm_sse_VectorAdd( V1, V2: TGLZVector):TGLZVector;assembler;
  15. asm
  16.   movaps xmm0,[RDX] //[V1]
  17.   movups xmm1,[R8] //[V2]
  18.   addps xmm0,xmm1
  19.   movups [ECX], xmm0  //[result]
  20. End;
  21.  
  22. procedure asm_sse_AddVector( V1, V2: TGLZVector; var V:TGLZVector); assembler;register;
  23. asm
  24.   movaps xmm0,[V1]
  25.   movaps xmm1,[V2]
  26.   addps xmm0,xmm1
  27.   movaps [V], XMM0
  28. end;
  29.  
  30. function nc_VectorAdd( AVector, AVector2: TGLZVector):TGLZVector;
  31. begin
  32.  result.v[_iX]:=AVector.v[_iX] + AVector2.v[_iX];
  33.  result.v[_iY]:=AVector.v[_iY] + AVector2.v[_iY];
  34.  result.v[_iZ]:=AVector.v[_iZ] + AVector2.v[_iZ];
  35.  result.v[_iW] :=cZero;
  36. end;
  37.  
  38. procedure nc_AddVector( AVector, AVector2: TGLZVector;Var V:TGLZVector);
  39. begin
  40.  V.v[_iX]:=AVector.v[_iX] + AVector2.v[_iX];
  41.  V.v[_iY]:=AVector.v[_iY] + AVector2.v[_iY];
  42.  V.v[_iZ]:=AVector.v[_iZ] + AVector2.v[_iZ];
  43.  V.v[_iW] :=cZero;
  44. end;  

The results (a loop of 1000 additions)

 - asm_sse_VectorAdd  =  9941.37904679775 µs
 - nc_VectorAdd           =   8732.70474082232 µs

 - asm_sse_AddVector =  2530.26766065706 µs
 - nc_AddVector          =  1835.35809546211 µs

The output in Asm view
Quote
asm
0000000100186370 55                          push   %rbp
0000000100186371 4889e5                   mov    %rsp,%rbp
     ..\..\..\source\GLZVectorMath.pas:244     movaps xmm0,[RDX]
0000000100186374 0f2802                   movaps (%rdx),%xmm0
      ..\..\..\source\GLZVectorMath.pas:245     movups xmm1,[R8]
0000000100186377 410f1008                 movups (%r8),%xmm1
     ..\..\..\source\GLZVectorMath.pas:246     addps xmm0,xmm1
000000010018637B 0f58c1                   addps  %xmm1,%xmm0
      ..\..\..\source\GLZVectorMath.pas:247     movups [ECX], xmm0//[result],xmm0 //RCX
000000010018637E 0f1101                   movups %xmm0,(%rcx)
      ..\..\..\source\GLZVectorMath.pas:248     End;
0000000100186381 488d6500                 lea    0x0(%rbp),%rsp
0000000100186385 5d                       pop    %rbp
0000000100186386 c3                       retq   

And this for the native code :

Quote
..\..\..\source\GLZVectorMath.pas:728     begin
00000001001866B0 488d6424c8               lea    -0x38(%rsp),%rsp
00000001001866B5 4889c8                   mov    %rcx,%rax
00000001001866B8 48891424                 mov    %rdx,(%rsp)
00000001001866BC 4c89442408               mov    %r8,0x8(%rsp)
00000001001866C1 488b0c24                 mov    (%rsp),%rcx
00000001001866C5 488b11                   mov    (%rcx),%rdx
00000001001866C8 4889542410               mov    %rdx,0x10(%rsp)
00000001001866CD 488b5108                 mov    0x8(%rcx),%rdx
00000001001866D1 4889542418               mov    %rdx,0x18(%rsp)
00000001001866D6 488b4c2408               mov    0x8(%rsp),%rcx
00000001001866DB 488b11                   mov    (%rcx),%rdx
00000001001866DE 4889542420               mov    %rdx,0x20(%rsp)
00000001001866E3 488b5108                 mov    0x8(%rcx),%rdx
00000001001866E7 4889542428               mov    %rdx,0x28(%rsp)
..\..\..\source\GLZVectorMath.pas:729     result.v[_iX]:=AVector.v[_iX] + AVector2.v[_iX];
00000001001866EC 0fb60dfd5d0200           movzbl 0x25dfd(%rip),%ecx        # 0x1001ac4f0 <TC_$GLZMATH_$$_CZERO+16>
00000001001866F3 0fb615f65d0200           movzbl 0x25df6(%rip),%edx        # 0x1001ac4f0 <TC_$GLZMATH_$$_CZERO+16>
00000001001866FA f30f10448c10             movss  0x10(%rsp,%rcx,4),%xmm0
0000000100186700 f30f58449420             addss  0x20(%rsp,%rdx,4),%xmm0
0000000100186706 0fb615e35d0200           movzbl 0x25de3(%rip),%edx        # 0x1001ac4f0 <TC_$GLZMATH_$$_CZERO+16>
000000010018670D f30f110490               movss  %xmm0,(%rax,%rdx,4)
..\..\..\source\GLZVectorMath.pas:730     result.v[_iY]:=AVector.v[_iY] + AVector2.v[_iY];
0000000100186712 0fb615e75d0200           movzbl 0x25de7(%rip),%edx        # 0x1001ac500 <TC_$GLZMATH_$$_CZERO+32>
0000000100186719 0fb60de05d0200           movzbl 0x25de0(%rip),%ecx        # 0x1001ac500 <TC_$GLZMATH_$$_CZERO+32>
0000000100186720 f30f10449410             movss  0x10(%rsp,%rdx,4),%xmm0
0000000100186726 f30f58448c20             addss  0x20(%rsp,%rcx,4),%xmm0
000000010018672C 0fb615cd5d0200           movzbl 0x25dcd(%rip),%edx        # 0x1001ac500 <TC_$GLZMATH_$$_CZERO+32>
0000000100186733 f30f110490               movss  %xmm0,(%rax,%rdx,4)
..\..\..\source\GLZVectorMath.pas:731     result.v[_iZ]:=AVector.v[_iZ] + AVector2.v[_iZ];
0000000100186738 0fb615d15d0200           movzbl 0x25dd1(%rip),%edx        # 0x1001ac510 <TC_$GLZMATH_$$_CZERO+48>
000000010018673F 0fb60dca5d0200           movzbl 0x25dca(%rip),%ecx        # 0x1001ac510 <TC_$GLZMATH_$$_CZERO+48>
0000000100186746 f30f10449410             movss  0x10(%rsp,%rdx,4),%xmm0
000000010018674C f30f58448c20             addss  0x20(%rsp,%rcx,4),%xmm0
0000000100186752 0fb615b75d0200           movzbl 0x25db7(%rip),%edx        # 0x1001ac510 <TC_$GLZMATH_$$_CZERO+48>
0000000100186759 f30f110490               movss  %xmm0,(%rax,%rdx,4)
..\..\..\source\GLZVectorMath.pas:732     result.v[_iW] :=cZero;
000000010018675E 0fb615bb5d0200           movzbl 0x25dbb(%rip),%edx        # 0x1001ac520 <TC_$GLZMATH_$$_CZERO+64>
0000000100186765 8b0d755d0200             mov    0x25d75(%rip),%ecx        # 0x1001ac4e0 <TC_$GLZMATH_$$_CZERO>
000000010018676B 890c90                   mov    %ecx,(%rax,%rdx,4)
..\..\..\source\GLZVectorMath.pas:733     end;
000000010018676E 488d642438               lea    0x38(%rsp),%rsp
0000000100186773 c30000000000000000000000 retq   


So my question is how to bypass this; it seems to be what decreases performance:

0000000100186370 55                          push   %rbp
0000000100186371 4889e5                   mov    %rsp,%rbp

and

0000000100186385 5d                       pop    %rbp

How to do to optimize this.

If that is not possible, is it really worthwhile to use ASM/SSE with FPC?

PS : Same results in debug mode and with my release options :

 {$IFDEF RELEASE}
    {$RANGECHECKS OFF}
    {$FPUTYPE SSE3}
    {$INLINE ON}
    {$ALIGN 32}
    {$OPTIMIZATION LOOPUNROLL,LEVEL3,UNCERTAIN,PEEPHOLE,ASMCSE,ORDERFIELDS,FASTMATH,CSE,DFA}
  {$ENDIF}

Thanks in advance. And sorry for my english

Laksen

  • Hero Member
  • *****
  • Posts: 596
    • J-Software
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #1 on: September 13, 2017, 12:50:29 am »
All of these are pass by value, which means you create a copy of them on the callee stack before passing a pointer to the new record when calling the procedure. That's the slowest part here for sure..

Quote
movups [ECX], xmm0
Clearly you were using 64bit registers before. Using a pointer to ECX will make it crash on a 64bit machine some day..

Quote
movups/movaps
The difference between aligned and unaligned is very significant. If your structures are aligned you should always use movaps. But if your structures aren't aligned to a 16byte boundary then you should use movups. And are you sure your stack structures are aligned at a 16byte boundary?

When using "Packed Record" special packing and alignment rules get inferred. You should study the manual about those.

But I really doubt we support any kind of way of making a simple vector 16byte aligned on the local stack...

BeanzMaster

  • Jr. Member
  • **
  • Posts: 69
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #2 on: September 13, 2017, 07:21:59 pm »
All of these are pass by value, which means you create a copy of them on the callee stack before passing a pointer to the new record when calling the procedure. That's the slowest part here for sure..

Quote
movups [ECX], xmm0
Clearly you were using 64bit registers before. Using a pointer to ECX will make it crash on a 64bit machine some day..

Yes clearly in 64bits  :)
I didn't know; I'll change it to RCX.
So how can I load an SSE register directly without making a copy?

Quote
movups/movaps
The difference between aligned and unaligned is very significant. If your structures are aligned you should always use movaps. But if your structures aren't aligned to a 16byte boundary then you should use movups. And are you sure your stack structures are aligned at a 16byte boundary?

How or where can I check whether the stack is aligned? I tried to force it with {$ALIGN 16}. No change.

As you can see in the code I posted, there are two versions: one uses the registers directly and the other uses the variable names. I've also tried adding NoStackFrame; and Register; at the end of the function declarations. No change.

When using "Packed Record" special packing and alignment rules get inferred. You should study the manual about those.

The size of TGLZVector is 128 bits (16 bytes), so isn't it aligned on a 16-byte boundary?
I tried  {$PACKRECORDS 16}: no change.
             
Do you have a link to the part of the FPC or Lazarus manual where I can find this information?

But I really doubt we support any kind of way of making a simple vector 16byte aligned on the local stack...
I don't understand. Why can't we create an aligned vector on the stack?

Thaddy

  • Hero Member
  • *****
  • Posts: 4777
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #3 on: September 13, 2017, 09:00:09 pm »
https://www.freepascal.org/docs-html/ref/refsu15.html
https://www.freepascal.org/docs-html/current/prog/progsu9.html#x16-150001.2.9
Note {$codealign} also handles code on the stack...(local)
Note you may be on your own if you insist on using assembler.... As I understand it it is your responsibility to align the stack.
But Pascal code - so also vectors - will be aligned to the codealign settings.

Anyway: if you use assembler I trust that you also know how to align your stack entries....Otherwise don't bother. Because it is rather basic...
« Last Edit: September 13, 2017, 09:21:03 pm by Thaddy »
"Logically, no number of positive outcomes at the level of experimental testing can confirm a scientific theory, but a single counterexample is logically decisive."

engkin

  • Hero Member
  • *****
  • Posts: 1758
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #4 on: September 14, 2017, 03:33:41 am »
So my question is how to bypass this; it seems to be what decreases performance:

0000000100186370 55                          push   %rbp
0000000100186371 4889e5                   mov    %rsp,%rbp

and

0000000100186385 5d                       pop    %rbp

How to do to optimize this.

Try this code:
Code: Pascal  [Select]
  1. procedure asm_sse_VectorAdd_();assembler;
  2. asm
  3.   movaps xmm0,[RDX] //[V1]
  4.   movups xmm1,[R8] //[V2]
  5.   addps xmm0,xmm1
  6.   movups [RCX], xmm0  //[result]
  7. End;

Akira1364

  • Sr. Member
  • ****
  • Posts: 333
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #5 on: September 14, 2017, 04:26:38 am »
Why are you assuming that your handwritten SSE-tuned ASM should be faster than what FPC produces natively from Pascal code? On top of that do you realize that if you implement a method purely in ASM, targeting a specific instruction set and nothing else, you're essentially crippling the performance of the application for anyone who has a CPU capable of using instructions more recent than the ones you used?

For example, my CPU supports up to AVX2. Why exactly do you suppose I would possibly want to run your SSE ASM version (SSE came out in 1999 on the Pentium 3, remember!) when I could simply set the appropriate FPC compiler flags for AVX2 along with O3 or O4 optimization, compile the Pascal implementation, and have a result that will be much, much, much, much, much faster than your version?

On a more general note, what is it with the Object Pascal development community and SSE/SSE2? Why do people continue to believe that that's the be-all-end-all of optimization? What year do you all think it is?!
« Last Edit: September 14, 2017, 04:17:53 pm by Akira1364 »

engkin

  • Hero Member
  • *****
  • Posts: 1758
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #6 on: September 14, 2017, 05:41:57 am »
Why are you assuming that your handwritten SSE-tuned ASM should be faster than what FPC produces natively from Pascal code?
It seems to me that you are assuming that we are assuming. We are not.

On top of that do you realize that if you implement a method purely in ASM, targeting a specific instruction set and nothing else, you're essentially crippling the performance of the application for anyone who has a CPU capable of using instructions more recent than the ones you used?
Why are you assuming that our code would target one specific instruction set?

For example, my CPU supports up to AVX2. Why exactly do you suppose I would possibly want to run your SSE ASM version (SSE came out in 1999 on the Pentium 3, remember!) when I could simply set the appropriate FPC compiler flags for AVX2, compile the Pascal implementation, and have a result that will perform much, much, much, much, much faster than your version?
What if some CPU does not support AVX2? Plus, you made that argument against using assembly recently, and here is Marco's answer. Maybe you missed it or forgot it.

On a more general note, what is it with the Object Pascal development community and SSE/SSE2? Why do people continue to believe that that's the be-all-end-all of optimization? What year do you all think it is?!
Again, why are you assuming that? who said SSE* is the be-all-end-all of optimization!!

Akira1364

  • Sr. Member
  • ****
  • Posts: 333
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #7 on: September 14, 2017, 06:02:39 am »
BeanzMaster is Jerome Delauney, maintainer of the Lazarus port of GLScene, is he not? I'm assuming this post probably has something to do with the assembly versions of the methods in GLVectorGeometry.pas. Fantastic library, and I really appreciate the work he does on keeping it compatible with Lazarus, but I definitely always set GEOMETRY_NO_ASM in my GLScene.inc...

To your first point:
He seemed objectively surprised that his handwritten ASM was not faster than the compiled Pascal version, as though it were something that should be surprising in general. (It's not. Almost nobody is as clever as they think they are! Myself included.)

To your second point:
Literally any time you handwrite assembly code, you must be targeting a specific instruction set, or at least not going "above" a certain one.

To your third point:
You obviously didn't understand the gist of what I was saying, which is that you should just write everything in the best Pascal you're capable of and allow people to specify the CPU optimization flags that are specifically relevant to them when compiling. And yes, I read Marcos response. It didn't really mean anything groundbreaking. Plus I think he was ultimately agreeing with me that the compiler generally produces better ASM than ASM handwritten by people?

To your fourth point:
I'm not the one making any assumptions (other than the one in regards to this thread being about GLScene code). Pascal programmers being obsessed with SSE and SSE2 ASM implementations of methods is just "a thing", and has been for years. Maybe you're too young to be aware of that? (And I don't mean that in any kind of insulting way... just in the sense that you may have literally not been around long enough to recognize it as a consistent trend.)
« Last Edit: September 14, 2017, 04:19:55 pm by Akira1364 »

engkin

  • Hero Member
  • *****
  • Posts: 1758
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #8 on: September 14, 2017, 06:42:23 am »
BeanzMaster is Jerome Delauney, maintainer of the Lazarus port of GLScene, is he not? I'm assuming this post probably has something to do with the assembly versions of the methods in GLVectorGeometry.pas. Fantastic library, and I really appreciate the work he does on keeping it compatible with Lazarus, but I definitely always set GEOMETRY_NO_ASM in my GLScene.inc...
As you can see he did not target one instruction set and gave you the option.
At least in this specific example he refuted one of your assumptions.

To your first point:
He seemed objectively surprised that his handwritten ASM was not faster than the compiled Pascal version, as though it were something that should be surprising in general. (It's not. Almost nobody is as clever as they think they are! Myself included.)
He also seemed to be surprised that FPC added some useless code that he believes is a possible reason to reduce the speed of his handwritten assembly code. That's why he started this thread, asking us how to get rid of that code. Laksen tried to explain that "The difference between aligned and unaligned is very significant.". Then the alignment on the stack became an obvious issue.

To your second point:
Literally any time you handwrite assembly code, you must be targeting a specific instruction set, or at least not going "above" a certain one.
Usually you check what the CPU supports and change the path of code based on that. Since SSE is pretty old, it makes perfect sense to start with as it covers a wide range of CPUs.

To your third point:
You obviously didn't understand the gist of what I was saying, which is that you should just write everything in the best Pascal you're capable of and allow people to specify the CPU optimization flags that are specifically relevant to them when compiling. And yes, I read Marcos response. It didn't really mean anything groundbreaking. Plus I think he was ultimately agreeing with me that the compiler generally produces better ASM than ASM handwritten by people?
Let me quote this part here: "one should always have both implementations and compare"
We have an impressive member here, BeRo, you might want to see if he uses any assembly in his code. I am sure if you poked around the RTL you'll notice some handwritten ASM, do you think it is time to replace it with pure pascal?

BeanzMaster

  • Jr. Member
  • **
  • Posts: 69
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #9 on: September 14, 2017, 10:41:43 am »
Hi to all
BeanzMaster is Jerome Delauney, maintainer of the Lazarus port of GLScene, is he not? I'm assuming this post probably has something to do with the assembly versions of the methods in GLVectorGeometry.pas. Fantastic library, and I really appreciate the work he does on keeping it compatible with Lazarus, but I definitely always set GEOMETRY_NO_ASM in my GLScene.inc...
You're right, Akira: I am running some tests to replace the "deprecated" asm functions in the VectorGeometry and math units  :)

https://www.freepascal.org/docs-html/ref/refsu15.html
https://www.freepascal.org/docs-html/current/prog/progsu9.html#x16-150001.2.9
Note {$codealign} also handles code on the stack...(local)
Note you may be on your own if you insist on using assembler.... As I understand it it is your responsibility to align the stack.
But Pascal code - so also vectors - will be aligned to the codealign settings.

Anyway: if you use assembler I trust that you also know how to align your stack entries....Otherwise don't bother. Because it is rather basic...

Thanks, I tried {$codealign} and the result is a little better. But sometimes (I can't say why, in 1.8rc4) the compiler returns "Error: Illegal alignment directive".
I also tried with an Array[0..3] of Single and with a pointer (PSingle): no change. Anyway...

Try this code:
Code: Pascal  [Select]
  1. procedure asm_sse_VectorAdd_();assembler;
  2. asm
  3.   movaps xmm0,[RDX] //[V1]
  4.   movups xmm1,[R8] //[V2]
  5.   addps xmm0,xmm1
  6.   movups [RCX], xmm0  //[result]
  7. End;

Same results

After some reading, it appears that:
  • The FPC compiler handles the stack differently from Delphi.
  • There still seems to be a problem between the {$align} and {$packedrecord} directives (not sure about that).
  • Using asm with SIMD instructions for small functions like the one posted here is not recommended. The code generated by FPC is better (due to its stack management).
    On the other hand, for batch data processing, writing specific SSE asm code to improve performance is possible.

Conclusion: I will keep the code in 'pure Pascal'. Perhaps I'll take a look at CUDA/OpenCL later (I must do some research on this subject).
I will try an SSE approach for batch data processing in some cases.

It would be nice to compare with Delphi, to see whether performance improves there or not.

Small question: if I compile with the AVX options and run the program on a PC that does not have AVX instructions, does the program fall back to SSE instructions, or does an error occur?

Thanks to all


« Last Edit: September 14, 2017, 10:54:41 am by BeanzMaster »

Thaddy

  • Hero Member
  • *****
  • Posts: 4777
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #10 on: September 14, 2017, 12:30:00 pm »
No. If you explicitly set AVX it will generate AVX code, not SSE(X).
"Logically, no number of positive outcomes at the level of experimental testing can confirm a scientific theory, but a single counterexample is logically decisive."

marcov

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 5875
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #11 on: September 14, 2017, 01:01:20 pm »
Quote
After some reading, it appears that:
  • The FPC compiler handles the stack differently from Delphi.
  • There still seems to be a problem between the {$align} and {$packedrecord} directives (not sure about that).
  • Using asm with SIMD instructions for small functions like the one posted here is not recommended. The code generated by FPC is better (due to its stack management).
    On the other hand, for batch data processing, writing specific SSE asm code to improve performance is possible.

  • The first point is way too generic. It might be a minor detail or an accidentally exploited Delphi bug. To say that FPC works differently is then a bit too broad.
  • I also notice that you pass complex structures (records) to functions, and then assume that they are by reference. Use CONST, or better constref to force that.
  • If you don't want a stackframe, flag the function with nostackframe directive
  • trying to do sse with relative short pieces of code is always dangerous and difficult to tune to multiple compilers
  • for best performance, find the heaviest uses in loops with preferably relatively short bodies, and convert those to assembler as a whole

Quote
Conclusion: I will keep the code in 'pure Pascal'. Perhaps I'll take a look at CUDA/OpenCL later (I must do some research on this subject).
I will try an SSE approach for batch data processing in some cases.

(CUDA is complicated and still emerging. However, the even bigger problem is that, at least for me, transferring data to and from the GPU already took about as long as the calculations themselves, making it slower overall (image conversions).)

Quote
Small question: if I compile with the AVX options and run the program on a PC that does not have AVX instructions, does the program fall back to SSE instructions, or does an error occur?

No, it will crash. However you can add both sse and avx routines, run a detection, and then set a procvar to the desired routine (or have a DLL in both sse and avx(2) variant).

Note AVX is mostly floating point (Ivy Bridge (3xxx), maybe Sandy Bridge (2xxx) too), and AVX2 adds the corresponding integer instructions (Haswell+, 4xxx or newer).

BeanzMaster

  • Jr. Member
  • **
  • Posts: 69
Re: SSE ASM Function is less accurate than pascal code, why ? and or How to ?
« Reply #12 on: September 14, 2017, 01:18:23 pm »
No. If you explicitly set AVX it will generate AVX code, not SSE(X).
No, it will crash. However you can add both sse and avx routines, run a detection, and then set a procvar to the desired routine (or have a DLL in both sse and avx(2) variant).

Note AVX is mostly floating point (Ivy Bridge (3xxx), maybe Sandy Bridge (2xxx) too), and AVX2 adds the corresponding integer instructions (Haswell+, 4xxx or newer).

Ok Thanks

  • I also notice that you pass complex structures (records) to functions, and then assume that they are by reference. Use CONST, or better constref to force that.
  • If you don't want a stackframe, flag the function with nostackframe directive
  • trying to do sse with relative short pieces of code is always dangerous and difficult to tune to multiple compilers
  • for best performance, find the heaviest uses in loops with preferably relatively short bodies, and convert those to assembler as a whole
  • Use CONST, or better constref to force that : no differences
  • for best performance, find the heaviest uses in loops with preferably relatively short bodies, and convert those to assembler as a whole : It's seems the good way

(CUDA is complicated and still emerging. However, the even bigger problem is that, at least for me, transferring data to and from the GPU already took about as long as the calculations themselves, making it slower overall (image conversions).)

Thanks for advice.

So i've converted this Original Delphi code from Eric Grange : https://www.delphitools.info/2011/03/24/kudos-to-the-firefox-4-tracemonkey-team/ for testing alignment (under windows 10 and with  Lazarus 1.8rc4 64bit)

Code: Pascal  [Select]
  1. Unit Unit1;
  2.  
  3. // Original Delphi code from Eric Grange : https://www.delphitools.info/2011/03/24/kudos-to-the-firefox-4-tracemonkey-team/
  4. {$mode objfpc}{$H+}
  5. {.$codealign 16}  // ---> unit1.pas(4,2) Error: Illegal alignment directive
  6. {$Align 16}
  7.  
  8. Interface
  9.  
  10. Uses
  11.   Classes, Sysutils, Windows, Fileutil, Forms, Controls, Graphics, Dialogs, ExtCtrls, StdCtrls,
  12.   math;
  13.  
  14. Type
  15.  
  16.   { Tform1 }
  17.  
  18.   Tform1 = Class(Tform)
  19.     Button1 : Tbutton;
  20.     Checkbox1 : Tcheckbox;
  21.     Image1 : Timage;
  22.     Label1 : Tlabel;
  23.     Panel1 : Tpanel;
  24.     Panel2 : Tpanel;
  25.     Shape1 : Tshape;
  26.     Procedure Button1click(Sender : Tobject);
  27.     Procedure Checkbox1click(Sender : Tobject);
  28.     Procedure Formcreate(Sender : Tobject);
  29.     Procedure Image1mousedown(Sender : Tobject; Button : Tmousebutton; Shift : Tshiftstate; X, Y : Integer);
  30.     Procedure Image1mousemove(Sender : Tobject; Shift : Tshiftstate; X, Y : Integer);
  31.     Procedure Image1mouseup(Sender : Tobject; Button : Tmousebutton; Shift : Tshiftstate; X, Y : Integer);
  32.   Private
  33.  
  34.   Public
  35.     FBitmap : TBitmap;
  36.     FScanLines : array of PInteger;
  37.  
  38.     iterLimit : Integer;
  39.     qmin, qmax, pmin, pmax : Double;
  40.     controlColors : array of TColor;
  41.     colors : array of TColor;
  42.     mbX, mbY : Integer;
  43.  
  44.     procedure ResetMandel;
  45.  
  46.     procedure ResetControlColors;
  47.     procedure ComputeColors;
  48.  
  49.     procedure ComputeMandel;
  50.     procedure ComputeMandelDelphi;
  51.     procedure ComputeMandelSSE;
  52.  
  53.     procedure DrawPixel(x, y, c : Integer);
  54.  
  55.   End;
  56.  
  57. Var
  58.   Form1 : Tform1;
  59.  
  60. Implementation
  61.  
  62. {$R *.lfm}
  63.  
  64. const
  65.    MAX_COLORS = 512;
  66.  
  67. Procedure Tform1.Formcreate(Sender : Tobject);
  68. begin
  69. (*   FBitmap:=TBitmap.Create;
  70.    FBitmap.PixelFormat:=pf32bit;
  71.    FBitmap.SetSize(Image1.Width, Image1.Height);
  72.      SetLength(FScanLines, FBitmap.Height);
  73.    for i:=0 to FBitmap.Height-1 do
  74.       FScanLines[i]:=PInteger(FBitmap.RawImage.GetLineStart(i)); *)
  75.    ResetMandel;
  76.    ResetControlColors;
  77.    ComputeColors;
  78.    //Label1.Caption:='Double Size : '+InttoStr(SizeOf(Double)); //give 8 so 8x8bits = 64bits
  79.  //  ComputeMandel;
  80. End;
  81.  
  82. Procedure Tform1.Button1click(Sender : Tobject);
  83. Begin
  84.    ResetMandel;
  85.    ComputeMandel;
  86. End;
  87.  
  88. Procedure Tform1.Checkbox1click(Sender : Tobject);
  89. Begin
  90.    ComputeMandel;
  91. End;
  92.  
  93. Procedure Tform1.Image1mousedown(Sender : Tobject; Button : Tmousebutton; Shift : Tshiftstate; X, Y : Integer);
  94. Begin
  95.    mbX:=X;
  96.    mbY:=Y;
  97. End;
  98.  
  99. Procedure Tform1.Image1mousemove(Sender : Tobject; Shift : Tshiftstate; X, Y : Integer);
  100. var
  101.    s : Integer;
  102. begin
  103.    if ssLeft in Shift then begin
  104.       s := Max(X-mbX, Y-mbY);
  105.       if s>0 then begin
  106.          Shape1.SetBounds(mbX+Image1.Left, mbY+Image1.Top, s, s);
  107.          Shape1.Visible:=True;
  108.       end;
  109.    end;
  110. End;
  111.  
  112. Procedure Tform1.Image1mouseup(Sender : Tobject; Button : Tmousebutton; Shift : Tshiftstate; X, Y : Integer);
  113. var
  114.    s : Integer;
  115.    pw, qw : Double;
  116. begin
  117.    Shape1.Visible:=False;
  118.  
  119.    s:=Max(X-mbX, Y-mbY);
  120.    if (s>3) then
  121.    begin
  122.       X := mbX + s;
  123.       Y := mbY + s;
  124.       pw := pmax - pmin;
  125.       pmin := pmin + mbX * pw / FBitmap.Width;
  126.       pmax := pmax - (FBitmap.Width - X) * pw / FBitmap.Width;
  127.       qw := qmax - qmin;
  128.       qmin := qmin + (FBitmap.Height - Y) * qw / FBitmap.Height;
  129.       qmax := qmax - mbY * qw / FBitmap.Height;
  130.  
  131.       ComputeMandel;
  132.    end;
  133. End;
  134.  
  135. procedure TForm1.ResetMandel;
  136. begin
  137.    iterLimit := 100;
  138.    qmin := -1.5;
  139.    qmax := 1.5;
  140.    pmin := -2.25;
  141.    pmax := 0.75;
  142. end;
  143.  
  144. // ResetControlColors
  145. //
  146. procedure TForm1.ResetControlColors;
  147. begin
  148.    SetLength(controlColors, 5);
  149.  
  150.    controlColors[0] := RGBToColor($20, $00, $00);
  151.    controlColors[1] := RGBToColor($ff, $ff, $ff);
  152.    controlColors[2] := RGBToColor($A0, $00, $00);
  153.    controlColors[3] := RGBToColor($FF, $ff, $40);
  154.    controlColors[4] := RGBToColor($FF, $20, $20);
  155. end;
  156.  
  157. // ComputeMandel
  158. //
  159. procedure TForm1.ComputeMandel;
  160. var
  161.    start, stop, freq : Int64;
  162.    i:Integer;
  163. begin
  164.    // We must reset FBitmap to take change in Image1
  165.    if Assigned(FBitmap) then FreeAndNil(FBitmap);
  166.    FBitmap:=TBitmap.Create;
  167.    FBitmap.PixelFormat:=pf32bit;
  168.    FBitmap.SetSize(Image1.Width, Image1.Height);
  169.      SetLength(FScanLines, FBitmap.Height);
  170.    for i:=0 to FBitmap.Height-1 do
  171.       FScanLines[i]:=PInteger(FBitmap.RawImage.GetLineStart(i));
  172.  
  173.    Start:=0; Stop:=0; freq:=1;
  174.    QueryPerformanceCounter(start);
  175.    FBitmap.BeginUpdate();
  176.    if CheckBox1.Checked then ComputeMandelSSE
  177.    else ComputeMandelDelphi;
  178.    FBitmap.EndUpdate();
  179.    QueryPerformanceCounter(stop);
  180.    QueryPerformanceFrequency(freq);
  181.  
  182.    Image1.Picture.Bitmap := FBitmap;
  183.  
  184.    Label1.Caption:='Generate in '+Format('%.1f milliseconds', [(stop-start)/freq*1000]);
  185. end;
  186.  
  187. // ComputeMandelDelphi
  188. // Plain-Pascal escape-time renderer: one DrawPixel call per bitmap pixel.
  189. procedure TForm1.ComputeMandelDelphi;
  190. const
  191.    kmax = 256;                  // iteration cap per pixel
  192. var
  193.    dp, dq : Double;             // size of one pixel along the p / q axes
  194.    cr, ci : Double;             // point being tested
  195.    col, row, n : Integer;
  196.    zr, zi, zrNext, mag2 : Double;
  197. begin
  198.    dp := (pmax - pmin) / FBitmap.Width;
  199.    dq := (qmax - qmin) / FBitmap.Height;
  200.  
  201.    for col := 0 to FBitmap.Width-1 do begin
  202.       for row := 0 to FBitmap.Height-1 do begin
  203.  
  204.          cr := pmin + dp * col;
  205.          ci := qmax - dq * row;   // row 0 maps to qmax
  206.          zr := 0;
  207.          zi := 0;
  208.          n := 0;
  209.  
  210.          repeat
  211.             zrNext := zr * zr - zi * zi + cr;
  212.             zi := 2 * zr * zi + ci;   // still uses the old zr; zr is replaced on the next line
  213.             zr := zrNext;
  214.             mag2 := zr * zr + zi * zi;
  215.             Inc(n);
  216.          until ((mag2 > iterLimit) or (n >= kmax));
  217.  
  218.          // Pixels whose count reached kmax are drawn with index 0
  219.          if n >= kmax then
  220.             n := 0;
  221.  
  222.          DrawPixel(col, row, n);
  223.       end;
  224.    end;
  225. end;
  226.  
  227. // From https://github.com/UltraStar-Deluxe/USDX/blob/master/src/base/UCommon.pas
  228. type
  229.   // Header slot written just below every aligned block: it stores the unaligned pointer obtained from GetMem() so that FreeAlignedMem() can release it
  230.   PMemAlignHeader = ^TMemAlignHeader;
  231.   TMemAlignHeader = pointer;
  232.  
  233. (**
  234.  * Use this function to assure that allocated memory is aligned on a specific
  235.  * byte boundary.
  236.  * Alignment must be a power of 2.
  237.  *
  238.  * Important: Memory allocated with GetAlignedMem() MUST be freed with
  239.  * FreeAlignedMem(), FreeMem() will cause a segmentation fault.
  240.  *
  241.  * Hint: If you do not need dynamic memory, consider to allocate memory
  242.  * statically and use the {$ALIGN x} compiler directive. Note that delphi
  243.  * supports an alignment "x" of up to 8 bytes only whereas FPC supports
  244.  * alignments on 16 and 32 byte boundaries too.
  245.  *)
  246. {$WARNINGS OFF}
  247. function GetAlignedMem(Size: cardinal; Alignment: integer): pointer;
  248. const
  249.   MIN_ALIGNMENT = 16;
  250. var
  251.   RawBlock: pointer;
  252. begin
  253.   // GetMem() already provides some alignment (the original author cites at
  254.   // least 8 bytes); never align more loosely than MIN_ALIGNMENT.
  255.   if (Alignment < MIN_ALIGNMENT) then
  256.     Alignment := MIN_ALIGNMENT;
  257.  
  258.   Result := nil;
  259.  
  260.   // Over-allocate: header slot + payload + worst-case padding.
  261.   // Result stays nil when GetMem() hands back nil.
  262.   GetMem(RawBlock, SizeOf(TMemAlignHeader) + Size + Alignment);
  263.   if (RawBlock <> nil) then
  264.   begin
  265.     // skip past the space reserved for the header
  266.     Result := pointer(PtrUInt(RawBlock) + SizeOf(TMemAlignHeader));
  267.  
  268.     // round up to the next multiple of Alignment; this always advances by
  269.     // 1..Alignment bytes, even when the address was already aligned
  270.     Result := pointer(PtrUInt(Result) + Alignment - PtrUInt(Result) mod Alignment);
  271.  
  272.     // remember the pointer GetMem() returned in the slot directly below the
  273.     // aligned address, where FreeAlignedMem() reads it back
  274.     PMemAlignHeader(PtrUInt(Result) - SizeOf(TMemAlignHeader))^ := RawBlock;
  275.   end;
  276. end;
  277. {$WARNINGS ON}
  278.  
  279. {$WARNINGS OFF}
  280. procedure FreeAlignedMem(P: pointer);   // releases a block obtained from GetAlignedMem()
  281. begin
  282.   if (P = nil) then Exit;   // nothing to release
  283.   // the slot just below P holds the pointer that GetMem() originally returned
  284.   FreeMem(PMemAlignHeader(PtrUInt(P) - SizeOf(TMemAlignHeader))^); end;
  285. {$WARNINGS ON}
  286.  
  287.  
  288. // ComputeMandelSSE
  289. //
  290. // Same escape-time loop as ComputeMandelDelphi, with the per-iteration
  291. // arithmetic done in scalar SSE2 inline assembly (movsd/mulsd/addsd/subsd).
  292. //
  293. // Reads pmin/pmax/qmin/qmax, iterLimit and the size of FBitmap; writes one
  294. // pixel per (sx, sy) through DrawPixel.
  295. //
  296. // Each iteration is ONE self-contained asm block: it loads x0/y0 from the
  297. // Double locals, computes, and stores x0, y0 and r back. Nothing is kept in
  298. // XMM registers across Pascal statements, because the compiler may use those
  299. // registers itself (e.g. for the floating-point compare in the "until" test).
  300. //
  301. // No aligned buffers are needed: movsd moves a single 64-bit scalar and has
  302. // no 16-byte alignment requirement (unlike movaps/movapd).
  303. //
  304. procedure TForm1.ComputeMandelSSE;
  305. const
  306.    kmax = 256;
  307. var
  308.    xstep, ystep : Double;
  309.    r : Double;
  310.    sx, sy, k : Integer;
  311.    p, q, x0, y0 : Double;
  312.    // Kept as a local variable: per the original author's note, declaring c2
  313.    // as a typed constant raised a SIGSEGV at "movsd xmm4, c2".
  314.    c2 : Double;
  315. begin
  316.    c2 := 2.0;
  317.  
  318.    xstep := (pmax - pmin) / FBitmap.Width;
  319.    ystep := (qmax - qmin) / FBitmap.Height;
  320.  
  321.    for sx := 0 to FBitmap.Width-1 do
  322.    begin
  323.       for sy := 0 to FBitmap.Height-1 do
  324.       begin
  325.          p := pmin + xstep * sx;
  326.          q := qmax - ystep * sy;
  327.          k := 0;
  328.          x0 := 0;
  329.          y0 := 0;
  330.  
  331.          repeat
  332.             // Register use inside the block:
  333.             //   xmm0 = x0, xmm1 = y0 (then y), xmm2 = x (then r),
  334.             //   xmm3, xmm4 = scratch
  335.             asm
  336.                // Load the current point from the Double locals x0/y0
  337.                movsd xmm0, x0
  338.                movsd xmm1, y0
  339.                // x := x0 * x0 - y0 * y0 + p;
  340.                movsd xmm2, xmm0
  341.                mulsd xmm2, xmm2
  342.                movsd xmm3, xmm1
  343.                mulsd xmm3, xmm3
  344.                subsd xmm2, xmm3
  345.                movsd xmm4, p
  346.                addsd xmm2, xmm4
  347.                // y := 2 * x0 * y0 + q;
  348.                // computed as (y0 * x0) * 2; scaling by 2 is exact, so the order is harmless
  349.                mulsd xmm1, xmm0
  350.                movsd xmm4, c2
  351.                mulsd xmm1, xmm4
  352.                movsd xmm4, q
  353.                addsd xmm1, xmm4
  354.                // x0 := x;  y0 := y;   (written back to memory for the next iteration)
  355.                movsd x0, xmm2
  356.                movsd y0, xmm1
  357.                // r := x * x + y * y;
  358.                mulsd xmm2, xmm2
  359.                mulsd xmm1, xmm1
  360.                addsd xmm2, xmm1
  361.                movsd r, xmm2
  362.                // The register list below tells the compiler which registers this
  363.                // block overwrites, so it does not keep live values in them.
  364.             end ['xmm0', 'xmm1', 'xmm2', 'xmm3', 'xmm4'];
  365.             Inc(k);
  366.          until ((r > iterLimit) or (k >= kmax));
  367.  
  368.          // Pixels whose count reached kmax are drawn with index 0
  369.          if k >= kmax then k := 0;
  370.  
  371.          DrawPixel(sx, sy, k);
  372.       end;
  373.    end;
  374.    // All locals live on the stack; there is nothing to free here.
  375. end;
  376.  
  377. // ComputeColors
  378. // Builds the colors[] lookup table: entry 0 is black, then 64 linearly
  379. // interpolated entries per pair of adjacent control colors; entries from
  380. // 257 upward repeat the table with a period of 256.
  381. procedure TForm1.ComputeColors;
  382. var
  383.    i, k : Integer;
  384.    rstep, bstep, gstep : Double;
  385. begin
  386.    SetLength(colors, MAX_COLORS);
  387.  
  388.    colors[0] := RGB(0, 0, 0);
  389.    // One segment per PAIR of stops, so stop at High - 1: the body reads controlColors[i + 1]
  390.    for i:=0 to High(controlColors) - 1 do begin
  391.       rstep := (GetRValue(controlColors[i + 1]) - GetRValue(controlColors[i])) / 63;
  392.       gstep := (GetGValue(controlColors[i + 1]) - GetGValue(controlColors[i])) / 63;
  393.       bstep := (GetBValue(controlColors[i + 1]) - GetBValue(controlColors[i])) / 63;
  394.       // the steps divide by 63 so that k = 63 lands on the next stop
  395.       for k:=0 to 63 do
  396.          colors[k + (i * 64) + 1] := RGB(Round(GetRValue(controlColors[i]) + rstep * k),
  397.                                          Round(GetGValue(controlColors[i]) + gstep * k),
  398.                                          Round(GetBValue(controlColors[i]) + bstep * k));
  399.    end;
  400.  
  401.     for i := 257 to MAX_COLORS-1 do
  402.        colors[i] := colors[i - 256];
  403. end;
  404.  
  405. // DrawPixel: stores colors[c] at offset x of the cached line pointer FScanLines[y]
  406. procedure TForm1.DrawPixel(x, y, c : Integer); inline;
  407. var cell : PInteger;
  408. begin
  409.    cell := PInteger(FScanLines[y] + x);
  410.    cell^ := colors[c]; end;
  411.  
  412.  
  413. End.
  414.  
  415.  

The compiler emits warnings (the same ones appear when using GetAlignedMem), and {$align} does not seem to change anything.

Quote
Compilation du projet - Cible : project1.exe : Succès - Avertissements : 6
unit1.pas(325,25) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(326,25) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(338,28) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(343,28) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(346,28) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"
unit1.pas(356,25) Warning: Check size of memory operand "movsd: memory-operand-size is 64 bits, but expected [128 bits]"

And two screenshots: the first without SSE, the second with SSE.

As we can see, the SSE version produces a wrong result.

Do you have any clues that would help me understand what I am missing regarding the "data alignment" scheme?

Best regards

 

Recent

Get Lazarus at SourceForge.net. Fast, secure and Free Open Source software downloads Open Hub project report for Lazarus