Recent

Author Topic: FPC for high-performance computing  (Read 6086 times)

LV

  • Sr. Member
  • ****
  • Posts: 286
Re: FPC for high-performance computing
« Reply #15 on: May 18, 2025, 05:27:17 pm »
@Martin_fr, could you run chacha20's code with loop unrolling using the fpc trunk compiler? Thanks.

Code: Pascal  [Select][+][-]
  1. program Project1;
  2.  
  3. uses sysutils, strutils;
  4.  
  5. type
  6.   chacha20state = record
  7.     state: array[0..15] of dword;
  8.     keystream: array[0..15] of dword;
  9.     position: dword;
  10.   end;
  11.  
  12. procedure chacha20_set_counter(var state: chacha20state; counter: dword);
  13. begin
  14.   state.state[12] := counter;
  15.   state.position := 64;
  16. end;
  17.  
  18. procedure chacha20_init(var state: chacha20state; key, nonce: string; counter: dword=0);
  19. const
  20.   magic = 'expand 32-byte k';
  21. begin
  22.   fillchar(state, sizeof(state), 0);
  23.   move(magic[1], state.state[0], 16);
  24.   if length(key) > 32 then setlength(key, 32);
  25.   if key <> '' then move(key[1], state.state[4], length(key));
  26.   chacha20_set_counter(state, counter);
  27.   if length(nonce) > 12 then setlength(nonce, 12);
  28.   if nonce <> '' then move(nonce[1], state.state[13], length(nonce));
  29. end;
  30.  
  31. function rotl32(x, n: dword): dword; inline;
  32. begin
  33.   result := (x shl n) or (x shr (32-n));
  34. end;
  35.  
  36. procedure chacha20_quarterround(p: pdword; a, b, c, d: integer); inline;
  37. var
  38.   a_val, b_val, c_val, d_val: dword;
  39. begin
  40.   a_val := p[a];
  41.   b_val := p[b];
  42.   c_val := p[c];
  43.   d_val := p[d];
  44.  
  45.   a_val += b_val;
  46.   d_val := rotl32(d_val xor a_val, 16);
  47.   c_val += d_val;
  48.   b_val := rotl32(b_val xor c_val, 12);
  49.   a_val += b_val;
  50.   d_val := rotl32(d_val xor a_val, 8);
  51.   c_val += d_val;
  52.   b_val := rotl32(b_val xor c_val, 7);
  53.  
  54.   p[a] := a_val;
  55.   p[b] := b_val;
  56.   p[c] := c_val;
  57.   p[d] := d_val;
  58. end;
  59.  
  60. procedure chacha20_next_block(var state: chacha20state);
  61. var
  62.   i: integer;
  63. begin
  64.   move(state.state, state.keystream, 64);
  65.   for i := 1 to 10 do begin
  66.     // Column rounds
  67.     chacha20_quarterround(@state.keystream, 0, 4, 8, 12);
  68.     chacha20_quarterround(@state.keystream, 1, 5, 9, 13);
  69.     chacha20_quarterround(@state.keystream, 2, 6, 10, 14);
  70.     chacha20_quarterround(@state.keystream, 3, 7, 11, 15);
  71.     // Diagonal rounds
  72.     chacha20_quarterround(@state.keystream, 0, 5, 10, 15);
  73.     chacha20_quarterround(@state.keystream, 1, 6, 11, 12);
  74.     chacha20_quarterround(@state.keystream, 2, 7, 8, 13);
  75.     chacha20_quarterround(@state.keystream, 3, 4, 9, 14);
  76.   end;
  77.   for i := 0 to high(state.keystream) do
  78.     state.keystream[i] += state.state[i];
  79.   state.state[12] += 1;
  80.   state.position := 0;
  81. end;
  82.  
  83. procedure chacha20_xor(var state: chacha20state; data: pointer; len: dword);
  84. var
  85.   p: PByte;
  86.   remain, block_count, i: dword;
  87.   pData, pKey: PDWord;
  88. begin
  89.   p := data;
  90.   remain := 64 - state.position;
  91.  
  92.   if remain > 0 then begin
  93.     if remain >= 4 then begin
  94.       pData := PDWord(p);
  95.       pKey := @state.keystream[state.position div 4];
  96.       for i := 0 to (remain div 4) - 1 do begin
  97.         pData[i] := pData[i] xor pKey[i];
  98.       end;
  99.       inc(p, (remain div 4) * 4);
  100.       dec(len, (remain div 4) * 4);
  101.       inc(state.position, (remain div 4) * 4);
  102.       remain := remain mod 4;
  103.     end;
  104.  
  105.     if remain > 0 then begin
  106.       if remain > len then remain := len;
  107.       for i := 0 to remain - 1 do
  108.         p[i] := p[i] xor pbyte(@state.keystream[state.position + i])^;
  109.       inc(p, remain);
  110.       dec(len, remain);
  111.       inc(state.position, remain);
  112.     end;
  113.   end;
  114.  
  115.   block_count := len div 64;
  116.   if block_count > 0 then begin
  117.     pData := PDWord(p);
  118.     for i := 0 to block_count - 1 do begin
  119.       if state.position >= 64 then
  120.         chacha20_next_block(state);
  121.       pKey := @state.keystream[0];
  122.  
  123.       pData[0] := pData[0] xor pKey[0];
  124.       pData[1] := pData[1] xor pKey[1];
  125.       pData[2] := pData[2] xor pKey[2];
  126.       pData[3] := pData[3] xor pKey[3];
  127.       pData[4] := pData[4] xor pKey[4];
  128.       pData[5] := pData[5] xor pKey[5];
  129.       pData[6] := pData[6] xor pKey[6];
  130.       pData[7] := pData[7] xor pKey[7];
  131.       pData[8] := pData[8] xor pKey[8];
  132.       pData[9] := pData[9] xor pKey[9];
  133.       pData[10] := pData[10] xor pKey[10];
  134.       pData[11] := pData[11] xor pKey[11];
  135.       pData[12] := pData[12] xor pKey[12];
  136.       pData[13] := pData[13] xor pKey[13];
  137.       pData[14] := pData[14] xor pKey[14];
  138.       pData[15] := pData[15] xor pKey[15];
  139.  
  140.       inc(pData, 16);
  141.       state.position := 64;
  142.     end;
  143.     len := len mod 64;
  144.     p := PByte(pData);
  145.   end;
  146.  
  147.   if len > 0 then begin
  148.     if state.position >= 64 then
  149.       chacha20_next_block(state);
  150.     if len >= 4 then begin
  151.       pData := PDWord(p);
  152.       pKey := @state.keystream[state.position div 4];
  153.       for i := 0 to (len div 4) - 1 do begin
  154.         pData[i] := pData[i] xor pKey[i];
  155.       end;
  156.       inc(p, (len div 4) * 4);
  157.       dec(len, (len div 4) * 4);
  158.       inc(state.position, (len div 4) * 4);
  159.     end;
  160.     for i := 0 to len - 1 do
  161.       p[i] := p[i] xor pbyte(@state.keystream[state.position + i])^;
  162.     inc(state.position, len);
  163.   end;
  164. end;
  165.  
  166. const
  167.   size = 1024*1024*1024;
  168.  
  169. var
  170.   cc: chacha20state;
  171.   data: pointer;
  172.   start, end_: qword;
  173.  
  174. begin
  175.   chacha20_init(cc, DupeString('a', 32), DupeString('b', 12));
  176.   data := getmem(size);
  177.   start := GetTickCount64;
  178.   chacha20_xor(cc, data, size);
  179.   end_ := GetTickCount64;
  180.   writeln('FPC 3.2.2 Unrolled  ',end_-start, ' ms');
  181.   writeln('verify = ', pint32(@cc.keystream)^);
  182.   writeln(Format('%.2f MB/s', [(size/(1024*1024))/((end_-start)/1000)]));
  183.   readln;
  184. end.
  185.  

Martin_fr

  • Administrator
  • Hero Member
  • *
  • Posts: 11333
  • Debugger - SynEdit - and more
    • wiki
Re: FPC for high-performance computing
« Reply #16 on: May 18, 2025, 05:54:13 pm »
Code: Text  [Select][+][-]
  1. FPC 3.2.2 Unrolled  2578 ms
  2. verify = 17398848
  3. 397.21 MB/s

I tested his code with trunk / O3 and {$Optimization noLOOPUNROLL} to make sure trunk hadn't simply done more unroll already. But that made no difference.

Btw Core I7-8700K




I was actually surprised of the huge improvement. I know trunk had some work done. But in other tests, the gains were around 4% to 5%.

From what I have seen (commits / mail list) a lot of work went into the peephole optimizer. That are usually local optimizations (register usages over short bits of code, avoid storing to mem, change 2 statements to avoid wait cycles...)

I don't know if there was any work on other optimization parts such as DFA and the overall register allocation.

LV

  • Sr. Member
  • ****
  • Posts: 286
Re: FPC for high-performance computing
« Reply #17 on: May 18, 2025, 06:00:46 pm »
Thank you very much, I am delighted with the work of the FPC-Lazarus team. :)

Jorg3000

  • Jr. Member
  • **
  • Posts: 76
Re: FPC for high-performance computing
« Reply #18 on: May 18, 2025, 06:18:34 pm »
Is there a speed change with the following code change?

*EDIT*  Oops, this is based on the code from page 1 and may already be outdated.

Code: Pascal  [Select][+][-]
  1. procedure chacha20_xor(var state: chacha20state; data: pointer; len: dword);
  2. var
  3.   i: dword;
  4.   p: PByte;
  5.   pKey0: Pointer;
  6. begin
  7.   p := data;
  8.   pKey0 := @state.keystream[0];
  9.  
  10.   for i := 0 to len-1 do begin
  11.     if state.position >= 64 then chacha20_next_block(state);
  12.     p^ := p^ xor pbyte(pKey0 + state.position)^;
  13.     inc(p);
  14.     inc(state.position);
  15.   end;
  16. end;


« Last Edit: May 18, 2025, 06:21:59 pm by Jorg3000 »

ALLIGATOR

  • Full Member
  • ***
  • Posts: 177
Re: FPC for high-performance computing
« Reply #19 on: May 18, 2025, 07:32:44 pm »
I commented out that section of code completely and the Verify value did not change. How can I verify that the algorithm is working correctly?

https://forum.lazarus.freepascal.org/index.php/topic,71174.msg555295.html#msg555295
Code: Pascal  [Select][+][-]
  1.       //pData[0] := pData[0] xor pKey[0];
  2.       //pData[1] := pData[1] xor pKey[1];
  3.       //pData[2] := pData[2] xor pKey[2];
  4.       //pData[3] := pData[3] xor pKey[3];
  5.       //pData[4] := pData[4] xor pKey[4];
  6.       //pData[5] := pData[5] xor pKey[5];
  7.       //pData[6] := pData[6] xor pKey[6];
  8.       //pData[7] := pData[7] xor pKey[7];
  9.       //pData[8] := pData[8] xor pKey[8];
  10.       //pData[9] := pData[9] xor pKey[9];
  11.       //pData[10] := pData[10] xor pKey[10];
  12.       //pData[11] := pData[11] xor pKey[11];
  13.       //pData[12] := pData[12] xor pKey[12];
  14.       //pData[13] := pData[13] xor pKey[13];
  15.       //pData[14] := pData[14] xor pKey[14];
  16.       //pData[15] := pData[15] xor pKey[15];
« Last Edit: May 18, 2025, 07:56:01 pm by ALLIGATOR »

Fibonacci

  • Hero Member
  • *****
  • Posts: 754
  • Internal Error Hunter
Re: FPC for high-performance computing
« Reply #20 on: May 18, 2025, 07:55:58 pm »
You commented out data encryption (xoring), while the verify value is the first 4 bytes of the last 64-byte keystream block.

But I checked the data and it is correct, same output as libsodium.

Here with data verification, the last 4 bytes:

Code: Pascal  [Select][+][-]
  1. begin
  2.   chacha20_init(cc, DupeString('a', 32), DupeString('b', 12));
  3.   data := getmem(size);
  4.   fillchar(data^, size, 'x');
  5.   start := GetTickCount64;
  6.   chacha20_xor(cc, data, size);
  7.   end_ := GetTickCount64;
  8.   writeln(end_-start, ' ms');
  9.   writeln('verify = ', pint32(@cc.keystream)^);
  10.   writeln('verify2 = ', pint32(data+size-4)^);
  11.   writeln(Format('%.2f MB/s', [(size/(1024*1024))/((end_-start)/1000)]));
  12.   readln;
  13. end.



Code: Text  [Select][+][-]
  1. FPC 3.3.1 unrolled: 2765 ms verify2 = -643586287  370,34 MB/s
  2. FPC+libsodium:       547 ms verify2 = -643586287 1872,03 MB/s



@Jorg3000: no change at all
« Last Edit: May 18, 2025, 11:11:18 pm by Fibonacci »

LV

  • Sr. Member
  • ****
  • Posts: 286
Re: FPC for high-performance computing
« Reply #21 on: May 28, 2025, 10:36:04 pm »
Perhaps someone will find this topic interesting.  ;)
I compare implementations of a 1D linear transport equation solver using the explicit upwind scheme:
Domain: 1.0 (discretized into 1,000,000 grid points)
Wave speed: c = 1.0
CFL condition: 0.9
Simulation time: 0.01.

Win11; i7-8700
Code: Text  [Select][+][-]
  1. -------------------------------------------------------------------------
  2.             | GFortan & OpenMP   |    C++ & OpenMP    | FPC & TThread   |
  3.             |  (GCC 14.2.0 -O3)  |   (GCC 14.2.0 -O3) |   3.2.2 (-O3)   |
  4. -------------------------------------------------------------------------
  5. ExTime,  s  |       10.7         |          10.7      |      10.3       |
  6. -------------------------------------------------------------------------
  7.  

Source codes in the appendix.
« Last Edit: May 28, 2025, 10:37:58 pm by LV »

LV

  • Sr. Member
  • ****
  • Posts: 286
Re: FPC for high-performance computing
« Reply #22 on: May 31, 2025, 01:48:47 pm »
Today, I switched to a laptop with an i7-12700H processor and ran the task from the previous post with an increased simulation time of 0.1. The results puzzled me - FPC is twice as fast as C++ GCC.  :o

output (C++ & OpenMP GCC 14.2.0 -O3):

Code: Text  [Select][+][-]
  1. Total steps: 111112
  2. Execution time: 82.014 seconds
  3. Press Enter to exit...
  4.  

output (FPC & TThread 3.2.2 -O3):

Code: Text  [Select][+][-]
  1. Enter number of threads: 16
  2. Total steps: 111112
  3. Execution time: 38.69 seconds
  4. Press Enter to exit...
  5.  

srvaldez

  • Full Member
  • ***
  • Posts: 130
Re: FPC for high-performance computing
« Reply #23 on: May 31, 2025, 03:51:43 pm »
my times
Code: [Select]
fortran
Total steps: 111112
gfortran    Execution time:    115.232 seconds, 92.776 seconds with openmp
Intel ifort Execution time:     87.984 seconds
Intel ifort Execution time:     48.819 seconds with Qiopenmp

cpp
Total steps: 111112
g+++  Execution time: 109.546 seconds, 61.624 seconds with openmp
Intel Execution time: 89.768 seconds
Intel Execution time: 38.194 seconds with Qiopenmp

pascal
Enter number of threads: 16
Total steps: 111112
Execution time: 61.19 seconds
« Last Edit: May 31, 2025, 04:05:15 pm by srvaldez »

marcov

  • Administrator
  • Hero Member
  • *
  • Posts: 12262
  • FPC developer.
Re: FPC for high-performance computing
« Reply #24 on: May 31, 2025, 05:03:12 pm »
Today, I switched to a laptop with an i7-12700H processor and ran the task from the previous post with an increased simulation time of 0.1. The results puzzled me - FPC is twice as fast as C++ GCC.  :o

It might be the difference of the  thread pool system based on recreating vs recycling worker threads. That might have some influence on which core  a job is initially scheduled

Thaddy

  • Hero Member
  • *****
  • Posts: 17148
  • Ceterum censeo Trump esse delendam
Re: FPC for high-performance computing
« Reply #25 on: May 31, 2025, 09:31:18 pm »
What puzzles me is that I do not immediately see optimization settings.
Especially on the Pascal side -CfXXX and -Sv
As we all know the FPC defaults are really conservative.
Due to censorship, I changed this to "Nelly the Elephant". Keeps the message clear.

LV

  • Sr. Member
  • ****
  • Posts: 286
Re: FPC for high-performance computing
« Reply #26 on: June 01, 2025, 06:43:27 pm »
@srvaldez, thank you for testing. Which hardware did you use to obtain these results? @marcov and @Thaddy, I appreciate your helpful advice. There is a lot to consider.

srvaldez

  • Full Member
  • ***
  • Posts: 130
Re: FPC for high-performance computing
« Reply #27 on: June 01, 2025, 07:24:01 pm »
@srvaldez, thank you for testing. Which hardware did you use to obtain these results?
Quote
OS Name   Microsoft Windows 11 Pro
System Type   x64-based PC
Processor   Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz, 3600 Mhz, 8 Core(s), 16 Logical Processor(s)

gues1

  • Jr. Member
  • **
  • Posts: 52
Re: FPC for high-performance computing
« Reply #28 on: June 01, 2025, 09:44:14 pm »
Enter number of threads: 16
Total steps: 11112
Execution time: 2.29 seconds
Press Enter to exit...

Win11, I9-14900HX- 8 PCores, 16 ECores
But with Delphi 12.3, 64 bit app, Release mode, I don't have Lazarus now to test.

gues1

  • Jr. Member
  • **
  • Posts: 52
Re: FPC for high-performance computing
« Reply #29 on: June 01, 2025, 10:00:48 pm »
Enter number of threads: 16
Total steps: 11112
Execution time: 2.31 seconds
Press Enter to exit...

Win11, I9-14900HX- 8 PCores, 16 ECores
With Lazarus 4.0, FPC 3.2.2, Optiomization -O3

 

TinyPortal © 2005-2018