Recent

Author Topic: FPC for high-performance computing  (Read 9061 times)

LV

  • Sr. Member
  • ****
  • Posts: 393
Re: FPC for high-performance computing
« Reply #15 on: May 18, 2025, 05:27:17 pm »
@Martin_fr, could you run chacha20's code with loop unrolling using the fpc trunk compiler? Thanks.

Code: Pascal  [Select][+][-]
  1. program Project1;
  2.  
  3. uses sysutils, strutils;
  4.  
  5. type
  6.   chacha20state = record
  7.     state: array[0..15] of dword;
  8.     keystream: array[0..15] of dword;
  9.     position: dword;
  10.   end;
  11.  
  12. procedure chacha20_set_counter(var state: chacha20state; counter: dword);
  13. begin
  14.   state.state[12] := counter;
  15.   state.position := 64;
  16. end;
  17.  
  18. procedure chacha20_init(var state: chacha20state; key, nonce: string; counter: dword=0);
  19. const
  20.   magic = 'expand 32-byte k';
  21. begin
  22.   fillchar(state, sizeof(state), 0);
  23.   move(magic[1], state.state[0], 16);
  24.   if length(key) > 32 then setlength(key, 32);
  25.   if key <> '' then move(key[1], state.state[4], length(key));
  26.   chacha20_set_counter(state, counter);
  27.   if length(nonce) > 12 then setlength(nonce, 12);
  28.   if nonce <> '' then move(nonce[1], state.state[13], length(nonce));
  29. end;
  30.  
  31. function rotl32(x, n: dword): dword; inline;
  32. begin
  33.   result := (x shl n) or (x shr (32-n));
  34. end;
  35.  
  36. procedure chacha20_quarterround(p: pdword; a, b, c, d: integer); inline;
  37. var
  38.   a_val, b_val, c_val, d_val: dword;
  39. begin
  40.   a_val := p[a];
  41.   b_val := p[b];
  42.   c_val := p[c];
  43.   d_val := p[d];
  44.  
  45.   a_val += b_val;
  46.   d_val := rotl32(d_val xor a_val, 16);
  47.   c_val += d_val;
  48.   b_val := rotl32(b_val xor c_val, 12);
  49.   a_val += b_val;
  50.   d_val := rotl32(d_val xor a_val, 8);
  51.   c_val += d_val;
  52.   b_val := rotl32(b_val xor c_val, 7);
  53.  
  54.   p[a] := a_val;
  55.   p[b] := b_val;
  56.   p[c] := c_val;
  57.   p[d] := d_val;
  58. end;
  59.  
  60. procedure chacha20_next_block(var state: chacha20state);
  61. var
  62.   i: integer;
  63. begin
  64.   move(state.state, state.keystream, 64);
  65.   for i := 1 to 10 do begin
  66.     // Column rounds
  67.     chacha20_quarterround(@state.keystream, 0, 4, 8, 12);
  68.     chacha20_quarterround(@state.keystream, 1, 5, 9, 13);
  69.     chacha20_quarterround(@state.keystream, 2, 6, 10, 14);
  70.     chacha20_quarterround(@state.keystream, 3, 7, 11, 15);
  71.     // Diagonal rounds
  72.     chacha20_quarterround(@state.keystream, 0, 5, 10, 15);
  73.     chacha20_quarterround(@state.keystream, 1, 6, 11, 12);
  74.     chacha20_quarterround(@state.keystream, 2, 7, 8, 13);
  75.     chacha20_quarterround(@state.keystream, 3, 4, 9, 14);
  76.   end;
  77.   for i := 0 to high(state.keystream) do
  78.     state.keystream[i] += state.state[i];
  79.   state.state[12] += 1;
  80.   state.position := 0;
  81. end;
  82.  
  83. procedure chacha20_xor(var state: chacha20state; data: pointer; len: dword);
  84. var
  85.   p: PByte;
  86.   remain, block_count, i: dword;
  87.   pData, pKey: PDWord;
  88. begin
  89.   p := data;
  90.   remain := 64 - state.position;
  91.  
  92.   if remain > 0 then begin
  93.     if remain >= 4 then begin
  94.       pData := PDWord(p);
  95.       pKey := @state.keystream[state.position div 4];
  96.       for i := 0 to (remain div 4) - 1 do begin
  97.         pData[i] := pData[i] xor pKey[i];
  98.       end;
  99.       inc(p, (remain div 4) * 4);
  100.       dec(len, (remain div 4) * 4);
  101.       inc(state.position, (remain div 4) * 4);
  102.       remain := remain mod 4;
  103.     end;
  104.  
  105.     if remain > 0 then begin
  106.       if remain > len then remain := len;
  107.       for i := 0 to remain - 1 do
  108.         p[i] := p[i] xor pbyte(@state.keystream[state.position + i])^;
  109.       inc(p, remain);
  110.       dec(len, remain);
  111.       inc(state.position, remain);
  112.     end;
  113.   end;
  114.  
  115.   block_count := len div 64;
  116.   if block_count > 0 then begin
  117.     pData := PDWord(p);
  118.     for i := 0 to block_count - 1 do begin
  119.       if state.position >= 64 then
  120.         chacha20_next_block(state);
  121.       pKey := @state.keystream[0];
  122.  
  123.       pData[0] := pData[0] xor pKey[0];
  124.       pData[1] := pData[1] xor pKey[1];
  125.       pData[2] := pData[2] xor pKey[2];
  126.       pData[3] := pData[3] xor pKey[3];
  127.       pData[4] := pData[4] xor pKey[4];
  128.       pData[5] := pData[5] xor pKey[5];
  129.       pData[6] := pData[6] xor pKey[6];
  130.       pData[7] := pData[7] xor pKey[7];
  131.       pData[8] := pData[8] xor pKey[8];
  132.       pData[9] := pData[9] xor pKey[9];
  133.       pData[10] := pData[10] xor pKey[10];
  134.       pData[11] := pData[11] xor pKey[11];
  135.       pData[12] := pData[12] xor pKey[12];
  136.       pData[13] := pData[13] xor pKey[13];
  137.       pData[14] := pData[14] xor pKey[14];
  138.       pData[15] := pData[15] xor pKey[15];
  139.  
  140.       inc(pData, 16);
  141.       state.position := 64;
  142.     end;
  143.     len := len mod 64;
  144.     p := PByte(pData);
  145.   end;
  146.  
  147.   if len > 0 then begin
  148.     if state.position >= 64 then
  149.       chacha20_next_block(state);
  150.     if len >= 4 then begin
  151.       pData := PDWord(p);
  152.       pKey := @state.keystream[state.position div 4];
  153.       for i := 0 to (len div 4) - 1 do begin
  154.         pData[i] := pData[i] xor pKey[i];
  155.       end;
  156.       inc(p, (len div 4) * 4);
  157.       dec(len, (len div 4) * 4);
  158.       inc(state.position, (len div 4) * 4);
  159.     end;
  160.     for i := 0 to len - 1 do
  161.       p[i] := p[i] xor pbyte(@state.keystream[state.position + i])^;
  162.     inc(state.position, len);
  163.   end;
  164. end;
  165.  
  166. const
  167.   size = 1024*1024*1024;
  168.  
  169. var
  170.   cc: chacha20state;
  171.   data: pointer;
  172.   start, end_: qword;
  173.  
  174. begin
  175.   chacha20_init(cc, DupeString('a', 32), DupeString('b', 12));
  176.   data := getmem(size);
  177.   start := GetTickCount64;
  178.   chacha20_xor(cc, data, size);
  179.   end_ := GetTickCount64;
  180.   writeln('FPC 3.2.2 Unrolled  ',end_-start, ' ms');
  181.   writeln('verify = ', pint32(@cc.keystream)^);
  182.   writeln(Format('%.2f MB/s', [(size/(1024*1024))/((end_-start)/1000)]));
  183.   readln;
  184. end.
  185.  

Martin_fr

  • Administrator
  • Hero Member
  • *
  • Posts: 12024
  • Debugger - SynEdit - and more
    • wiki
Re: FPC for high-performance computing
« Reply #16 on: May 18, 2025, 05:54:13 pm »
Code: Text  [Select][+][-]
  1. FPC 3.2.2 Unrolled  2578 ms
  2. verify = 17398848
  3. 397.21 MB/s

I tested his code with trunk / O3 and {$Optimization noLOOPUNROLL} to make sure trunk hadn't simply done more unroll already. But that made no difference.

Btw Core I7-8700K




I was actually surprised of the huge improvement. I know trunk had some work done. But in other tests, the gains were around 4% to 5%.

From what I have seen (commits / mail list) a lot of work went into the peephole optimizer. That are usually local optimizations (register usages over short bits of code, avoid storing to mem, change 2 statements to avoid wait cycles...)

I don't know if there was any work on other optimization parts such as DFA and the overall register allocation.

LV

  • Sr. Member
  • ****
  • Posts: 393
Re: FPC for high-performance computing
« Reply #17 on: May 18, 2025, 06:00:46 pm »
Thank you very much, I am delighted with the work of the FPC-Lazarus team. :)

Jorg3000

  • Jr. Member
  • **
  • Posts: 81
Re: FPC for high-performance computing
« Reply #18 on: May 18, 2025, 06:18:34 pm »
Is there a speed change with the following code change?

*EDIT*  Oops, this is based on the code from page 1 and may already be outdated.

Code: Pascal  [Select][+][-]
  1. procedure chacha20_xor(var state: chacha20state; data: pointer; len: dword);
  2. var
  3.   i: dword;
  4.   p: PByte;
  5.   pKey0: Pointer;
  6. begin
  7.   p := data;
  8.   pKey0 := @state.keystream[0];
  9.  
  10.   for i := 0 to len-1 do begin
  11.     if state.position >= 64 then chacha20_next_block(state);
  12.     p^ := p^ xor pbyte(pKey0 + state.position)^;
  13.     inc(p);
  14.     inc(state.position);
  15.   end;
  16. end;


« Last Edit: May 18, 2025, 06:21:59 pm by Jorg3000 »

ALLIGATOR

  • Sr. Member
  • ****
  • Posts: 333
  • I use FPC [main] 💪🐯💪
Re: FPC for high-performance computing
« Reply #19 on: May 18, 2025, 07:32:44 pm »
I commented out that section of code completely and the Verify value did not change. How can I verify that the algorithm is working correctly?

https://forum.lazarus.freepascal.org/index.php/topic,71174.msg555295.html#msg555295
Code: Pascal  [Select][+][-]
  1.       //pData[0] := pData[0] xor pKey[0];
  2.       //pData[1] := pData[1] xor pKey[1];
  3.       //pData[2] := pData[2] xor pKey[2];
  4.       //pData[3] := pData[3] xor pKey[3];
  5.       //pData[4] := pData[4] xor pKey[4];
  6.       //pData[5] := pData[5] xor pKey[5];
  7.       //pData[6] := pData[6] xor pKey[6];
  8.       //pData[7] := pData[7] xor pKey[7];
  9.       //pData[8] := pData[8] xor pKey[8];
  10.       //pData[9] := pData[9] xor pKey[9];
  11.       //pData[10] := pData[10] xor pKey[10];
  12.       //pData[11] := pData[11] xor pKey[11];
  13.       //pData[12] := pData[12] xor pKey[12];
  14.       //pData[13] := pData[13] xor pKey[13];
  15.       //pData[14] := pData[14] xor pKey[14];
  16.       //pData[15] := pData[15] xor pKey[15];
« Last Edit: May 18, 2025, 07:56:01 pm by ALLIGATOR »
I may seem rude - please don't take it personally

Fibonacci

  • Hero Member
  • *****
  • Posts: 788
  • Internal Error Hunter
Re: FPC for high-performance computing
« Reply #20 on: May 18, 2025, 07:55:58 pm »
You commented out data encryption (xoring), while the verify value is the first 4 bytes of the last 64-byte keystream block.

But I checked the data and it is correct, same output as libsodium.

Here with data verification, the last 4 bytes:

Code: Pascal  [Select][+][-]
  1. begin
  2.   chacha20_init(cc, DupeString('a', 32), DupeString('b', 12));
  3.   data := getmem(size);
  4.   fillchar(data^, size, 'x');
  5.   start := GetTickCount64;
  6.   chacha20_xor(cc, data, size);
  7.   end_ := GetTickCount64;
  8.   writeln(end_-start, ' ms');
  9.   writeln('verify = ', pint32(@cc.keystream)^);
  10.   writeln('verify2 = ', pint32(data+size-4)^);
  11.   writeln(Format('%.2f MB/s', [(size/(1024*1024))/((end_-start)/1000)]));
  12.   readln;
  13. end.



Code: Text  [Select][+][-]
  1. FPC 3.3.1 unrolled: 2765 ms verify2 = -643586287  370,34 MB/s
  2. FPC+libsodium:       547 ms verify2 = -643586287 1872,03 MB/s



@Jorg3000: no change at all
« Last Edit: May 18, 2025, 11:11:18 pm by Fibonacci »

LV

  • Sr. Member
  • ****
  • Posts: 393
Re: FPC for high-performance computing
« Reply #21 on: May 28, 2025, 10:36:04 pm »
Perhaps someone will find this topic interesting.  ;)
I compare implementations of a 1D linear transport equation solver using the explicit upwind scheme:
Domain: 1.0 (discretized into 1,000,000 grid points)
Wave speed: c = 1.0
CFL condition: 0.9
Simulation time: 0.01.

Win11; i7-8700
Code: Text  [Select][+][-]
  1. -------------------------------------------------------------------------
  2.             | GFortan & OpenMP   |    C++ & OpenMP    | FPC & TThread   |
  3.             |  (GCC 14.2.0 -O3)  |   (GCC 14.2.0 -O3) |   3.2.2 (-O3)   |
  4. -------------------------------------------------------------------------
  5. ExTime,  s  |       10.7         |          10.7      |      10.3       |
  6. -------------------------------------------------------------------------
  7.  

Source codes in the appendix.
« Last Edit: May 28, 2025, 10:37:58 pm by LV »

LV

  • Sr. Member
  • ****
  • Posts: 393
Re: FPC for high-performance computing
« Reply #22 on: May 31, 2025, 01:48:47 pm »
Today, I switched to a laptop with an i7-12700H processor and ran the task from the previous post with an increased simulation time of 0.1. The results puzzled me - FPC is twice as fast as C++ GCC.  :o

output (C++ & OpenMP GCC 14.2.0 -O3):

Code: Text  [Select][+][-]
  1. Total steps: 111112
  2. Execution time: 82.014 seconds
  3. Press Enter to exit...
  4.  

output (FPC & TThread 3.2.2 -O3):

Code: Text  [Select][+][-]
  1. Enter number of threads: 16
  2. Total steps: 111112
  3. Execution time: 38.69 seconds
  4. Press Enter to exit...
  5.  

srvaldez

  • Full Member
  • ***
  • Posts: 187
Re: FPC for high-performance computing
« Reply #23 on: May 31, 2025, 03:51:43 pm »
my times
Code: [Select]
fortran
Total steps: 111112
gfortran    Execution time:    115.232 seconds, 92.776 seconds with openmp
Intel ifort Execution time:     87.984 seconds
Intel ifort Execution time:     48.819 seconds with Qiopenmp

cpp
Total steps: 111112
g+++  Execution time: 109.546 seconds, 61.624 seconds with openmp
Intel Execution time: 89.768 seconds
Intel Execution time: 38.194 seconds with Qiopenmp

pascal
Enter number of threads: 16
Total steps: 111112
Execution time: 61.19 seconds
« Last Edit: May 31, 2025, 04:05:15 pm by srvaldez »

marcov

  • Administrator
  • Hero Member
  • *
  • Posts: 12597
  • FPC developer.
Re: FPC for high-performance computing
« Reply #24 on: May 31, 2025, 05:03:12 pm »
Today, I switched to a laptop with an i7-12700H processor and ran the task from the previous post with an increased simulation time of 0.1. The results puzzled me - FPC is twice as fast as C++ GCC.  :o

It might be the difference of the  thread pool system based on recreating vs recycling worker threads. That might have some influence on which core  a job is initially scheduled

Thaddy

  • Hero Member
  • *****
  • Posts: 18704
  • To Europe: simply sell USA bonds: dollar collapses
Re: FPC for high-performance computing
« Reply #25 on: May 31, 2025, 09:31:18 pm »
What puzzles me is that I do not immediately see optimization settings.
Especially on the Pascal side -CfXXX and -Sv
As we all know the FPC defaults are really conservative.
If Europe sells their USA bonds the USD will collapse. Europe can affort that given average state debts. The USA can't affort that. Just an advice...

LV

  • Sr. Member
  • ****
  • Posts: 393
Re: FPC for high-performance computing
« Reply #26 on: June 01, 2025, 06:43:27 pm »
@srvaldez, thank you for testing. Which hardware did you use to obtain these results? @marcov and @Thaddy, I appreciate your helpful advice. There is a lot to consider.

srvaldez

  • Full Member
  • ***
  • Posts: 187
Re: FPC for high-performance computing
« Reply #27 on: June 01, 2025, 07:24:01 pm »
@srvaldez, thank you for testing. Which hardware did you use to obtain these results?
Quote
OS Name   Microsoft Windows 11 Pro
System Type   x64-based PC
Processor   Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz, 3600 Mhz, 8 Core(s), 16 Logical Processor(s)

gues1

  • Guest
Re: FPC for high-performance computing
« Reply #28 on: June 01, 2025, 09:44:14 pm »
Enter number of threads: 16
Total steps: 11112
Execution time: 2.29 seconds
Press Enter to exit...

Win11, I9-14900HX- 8 PCores, 16 ECores
But with Delphi 12.3, 64 bit app, Release mode, I don't have Lazarus now to test.

gues1

  • Guest
Re: FPC for high-performance computing
« Reply #29 on: June 01, 2025, 10:00:48 pm »
Enter number of threads: 16
Total steps: 11112
Execution time: 2.31 seconds
Press Enter to exit...

Win11, I9-14900HX- 8 PCores, 16 ECores
With Lazarus 4.0, FPC 3.2.2, Optiomization -O3

 

TinyPortal © 2005-2018