Recent

Author Topic: Benchmark swap endianness of buffer  (Read 548 times)

LemonParty

  • Hero Member
  • *****
  • Posts: 526
Benchmark swap endianness of buffer
« on: May 22, 2026, 04:15:54 pm »
Here is a benchmark that tests swapping the endianness of a LongInt buffer.

The benchmark contains 4 functions:
1. The simplest implementation with a plain loop (SwapEndianSimple);
2. A loop-unrolled implementation (SwapEndianUnroll);
3. An SSSE3 implementation (SwapEndianSSSE3);
4. An AVX2 implementation (SwapEndianAVX2).

As requested in the previous benchmark, each test is run for several iterations, and the results show the total time. The benchmark is split into a "32 KB" category, which uses only 32 KB of the input buffer (so it should fit into the L1 cache), and a 1024 * 1024 elements category.

Tested on 64-bit only. I also wrote code for UNIX, but it is not tested.

The benchmark:
Code: Pascal  [Select][+][-]
  1. {$mode objfpc}{$H+}
  2. {$inline on}
  3. {$If Defined(CPU386) OR Defined(CPUX64)}
  4.   {$ASMMODE intel}
  5. {$EndIf}
  6. {$SMARTLINK ON}
  7. {$Calling Register}
  8. {$CodeAlign proc=32}
  9. {$CodeAlign loop=32}
  10.  
  11. uses stopwatch;
  12.  
  { Reverses the byte order of Count consecutive LongInt values in place,
    starting with the element that Buf refers to.

    Buf   - first element of the buffer (elements Buf[0] .. Buf[Count-1]).
    Count - number of LongInt elements to convert; 0 is a no-op.

    The previous version iterated i from 1 to Count over a zero-based view of
    the buffer, so it skipped element 0 and touched one element past the end. }
  procedure SwapEndianSimple(var Buf: LongInt; Count: SizeUInt);
  var
    Items: PLongInt;
    i: SizeUInt;
  begin
    Items:= @Buf;
    { Count is unsigned, so Count - 1 would wrap around for an empty buffer;
      guard before forming the upper loop bound. }
    if Count > 0 then
      for i:= 0 to Count - 1 do
        Items[i]:= SwapEndian(Items[i]);
  end;
  21.  
  { Reverses the byte order of Count consecutive LongInt values in place,
    handling four elements per loop iteration.

    Buf   - first element of the buffer.
    Count - number of LongInt elements to convert.

    The main loop consumes whole groups of four; the 0..3 leftover elements
    are handled by testing bit 1 (two elements) and bit 0 (one element) of
    Count. }
  procedure SwapEndianUnroll(var Buf: LongInt; Count: SizeUInt);
  var
    Cur: PLongInt;
    Quads: SizeUInt;
  begin
    Cur:= @Buf;
    Quads:= Count shr 2; { number of complete groups of four }

    while Quads > 0 do begin
      Cur[0]:= SwapEndian(Cur[0]);
      Cur[1]:= SwapEndian(Cur[1]);
      Cur[2]:= SwapEndian(Cur[2]);
      Cur[3]:= SwapEndian(Cur[3]);
      Inc(Cur, 4);
      Dec(Quads);
    end;

    { leftover pair }
    if (Count and 2) <> 0 then begin
      Cur[0]:= SwapEndian(Cur[0]);
      Cur[1]:= SwapEndian(Cur[1]);
      Inc(Cur, 2);
    end;

    { leftover single element }
    if (Count and 1) <> 0 then
      Cur^:= SwapEndian(Cur^);
  end;
  48.  
  { Reverses the byte order of Count consecutive LongInt values in place using
    the SSSE3 PSHUFB instruction.

    Buf   - first element of the buffer; no alignment is required because only
            unaligned loads/stores are used.
    Count - number of LongInt elements to convert; 0 is a no-op.

    The main loop converts 16 elements (64 bytes) per iteration. The remaining
    0..15 elements are handled by testing bits 3, 2, 1 and 0 of Count, which
    the main loop leaves holding the remainder.

    The former separate Windows and UNIX bodies differed only in the hard-coded
    register used for Count (dl / si) and in the scratch register (edi / eax).
    Referring to Count by name lets the compiler substitute the right register
    for the active calling convention, and eax is a scratch register in both,
    so a single body serves both targets.
    NOTE(review): as in the original post, the UNIX build is untested. }
  procedure SwapEndianSSSE3(var Buf: LongInt; Count: SizeUInt);assembler;
  const
    { PSHUFB control mask: reverses the 4 bytes inside each 32-bit lane }
    CSwapOrder: array [0..15] of Byte = (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
  asm
    movups xmm4,rip[CSwapOrder]
    cmp Count,16
    jb @Tail8

    // 16 elements per iteration
  @Block16:
    movups xmm0,[Buf]
    movups xmm1,[Buf+16]
    movups xmm2,[Buf+32]
    movups xmm3,[Buf+48]
    pshufb xmm0,xmm4
    pshufb xmm1,xmm4
    pshufb xmm2,xmm4
    pshufb xmm3,xmm4
    movups [Buf],xmm0
    movups [Buf+16],xmm1
    movups [Buf+32],xmm2
    movups [Buf+48],xmm3
    sub Count,16
    add Buf,64
    cmp Count,16
    jae @Block16

    // 8 leftover elements
  @Tail8:
    test Count,8
    jz @Tail4
    movups xmm0,[Buf]
    movups xmm1,[Buf+16]
    pshufb xmm0,xmm4
    pshufb xmm1,xmm4
    movups [Buf],xmm0
    movups [Buf+16],xmm1
    add Buf,32

    // 4 leftover elements
  @Tail4:
    test Count,4
    jz @Tail2
    movups xmm0,[Buf]
    pshufb xmm0,xmm4
    movups [Buf],xmm0
    add Buf,16

    // 2 leftover elements: only the low 8 bytes are loaded and stored
  @Tail2:
    test Count,2
    jz @Tail1
    movq xmm0,[Buf]
    pshufb xmm0,xmm4
    movq [Buf],xmm0
    add Buf,8

    // last single element
  @Tail1:
    test Count,1
    jz @Done
    mov eax,dword ptr[Buf]
    bswap eax
    mov dword ptr[Buf],eax
  @Done:
  end;
  175.  
  { Reverses the byte order of Count consecutive LongInt values in place using
    the AVX2 VPSHUFB instruction.

    Buf   - first element of the buffer; only unaligned loads/stores are used.
    Count - number of LongInt elements to convert; 0 is a no-op.

    The main loop converts 16 elements (two 32-byte vectors) per iteration.
    The remaining 0..15 elements are handled by testing bits 3, 2, 1 and 0 of
    Count, which the main loop leaves holding the remainder.

    The former separate Windows and UNIX bodies differed only in the hard-coded
    register used for Count (dl / si) and in the scratch register (edi / eax).
    Referring to Count by name lets the compiler substitute the right register
    for the active calling convention, and eax is a scratch register in both,
    so a single body serves both targets.
    NOTE(review): as in the original post, the UNIX build is untested. }
  procedure SwapEndianAVX2(var Buf: LongInt; Count: SizeUInt);assembler;
  const
    { VPSHUFB control mask: reverses the 4 bytes inside each 32-bit lane.
      Its first 16 bytes are also a valid mask for the 128-bit PSHUFB used
      in the tail. }
    CSwapOrder: array [0..31] of Byte =
    (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28);
  asm
    vmovups ymm2,rip[CSwapOrder]
    cmp Count,16
    jb @Tail8

    // 16 elements per iteration
  @Block16:
    vmovups ymm0,[Buf]
    vmovups ymm1,[Buf+32]
    vpshufb ymm0,ymm0,ymm2
    vpshufb ymm1,ymm1,ymm2
    vmovups [Buf],ymm0
    vmovups [Buf+32],ymm1
    sub Count,16
    add Buf,64
    cmp Count,16
    jae @Block16

    // 8 leftover elements
  @Tail8:
    test Count,8
    jz @Tail4
    vmovups ymm0,[Buf]
    vpshufb ymm0,ymm0,ymm2
    vmovups [Buf],ymm0
    add Buf,32

    // From here on only legacy SSE encodings are used. vzeroupper clears the
    // upper halves of the ymm registers first; the low half of ymm2 (xmm2)
    // still holds the first 16 mask bytes.
  @Tail4:
    vzeroupper
    test Count,4
    jz @Tail2
    movups xmm0,[Buf]
    pshufb xmm0,xmm2
    movups [Buf],xmm0
    add Buf,16

    // 2 leftover elements: only the low 8 bytes are loaded and stored
  @Tail2:
    test Count,2
    jz @Tail1
    movq xmm0,[Buf]
    pshufb xmm0,xmm2
    movq [Buf],xmm0
    add Buf,8

    // last single element
  @Tail1:
    test Count,1
    jz @Done
    mov eax,dword ptr[Buf]
    bswap eax
    mov dword ptr[Buf],eax
  @Done:
  end;
  288.  
  { Benchmark driver: times every SwapEndian* routine first on a buffer of
    CCacheSize elements and then on the full buffer of CSize elements, and
    prints the elapsed stopwatch ticks for each routine. }
  const
    { number of elements in the large test buffer }
    CSize = 1024 * 1024;
    { number of LongInt elements in 32 KB (the size assumed for the L1 cache) }
    CCacheSize = (32 * 1024) div SizeOf(LongInt);
    { timed calls per routine for the small and the large buffer }
    CLoopItersSmall = 100;
    CLoopItersBig = 4;
  type
    { common signature of all benchmarked routines }
    TSwapProc = procedure(var Buf: LongInt; Count: SizeUInt);
  var
    sw: TStopWatch;
    pL: PLongInt;

  { Calls Routine Iters times on the first Count elements of pL^ and prints
    Caption followed by the total elapsed stopwatch ticks. }
  procedure TimeRoutine(const Caption: string; Routine: TSwapProc;
    Count: SizeUInt; Iters: SizeInt);
  var
    i: SizeInt;
  begin
    sw.Reset; sw.Start;
    for i:= 1 to Iters do
      Routine(pL^, Count);
    sw.Stop;
    Writeln(Caption, sw.ElapsedTicks);
  end;

  { Prints the section header, fills the first Count elements (which also
    touches that memory before timing starts) and times every routine. }
  procedure RunSuite(Count: SizeUInt; Iters: SizeInt);
  begin
    Writeln(Count, ' ELEMENTS BY ', Iters, ' RESULTS');

    FillDWord(pL^, Count, $01020304);{warm up the cache}

    TimeRoutine('Simple    : ', @SwapEndianSimple, Count, Iters);
    TimeRoutine('Unroll    : ', @SwapEndianUnroll, Count, Iters);
    TimeRoutine('SSSE3     : ', @SwapEndianSSSE3, Count, Iters);
    TimeRoutine('AVX2      : ', @SwapEndianAVX2, Count, Iters);
  end;

  begin
    sw:= TStopWatch.Create;

    { NOTE(review): neither the buffer nor the stopwatch is released before
      the program ends, same as before; they are left to process teardown. }
    pL:= GetMem(CSize * SizeOf(LongInt));

    RunSuite(CCacheSize, CLoopItersSmall);
    RunSuite(CSize, CLoopItersBig);
  end.

Results:
Code: Pascal  [Select][+][-]
  1. 8192 ELEMENTS BY 100 RESULTS
  2. Simple    : 8511
  3. Unroll    : 7509
  4. SSSE3     : 610
  5. AVX2      : 396
  6. 1048576 ELEMENTS BY 4 RESULTS
  7. Simple    : 41141
  8. Unroll    : 47895
  9. SSSE3     : 10249
  10. AVX2      : 8998
  11.  
  12. 8192 ELEMENTS BY 100 RESULTS
  13. Simple    : 7771
  14. Unroll    : 7516
  15. SSSE3     : 658
  16. AVX2      : 499
  17. 1048576 ELEMENTS BY 4 RESULTS
  18. Simple    : 46116
  19. Unroll    : 42289
  20. SSSE3     : 8328
  21. AVX2      : 8982
  22.  
  23. 8192 ELEMENTS BY 100 RESULTS
  24. Simple    : 7884
  25. Unroll    : 9098
  26. SSSE3     : 511
  27. AVX2      : 488
  28. 1048576 ELEMENTS BY 4 RESULTS
  29. Simple    : 52158
  30. Unroll    : 42553
  31. SSSE3     : 7835
  32. AVX2      : 9188
  33.  
  34. 8192 ELEMENTS BY 100 RESULTS
  35. Simple    : 7585
  36. Unroll    : 7585
  37. SSSE3     : 902
  38. AVX2      : 455
  39. 1048576 ELEMENTS BY 4 RESULTS
  40. Simple    : 46307
  41. Unroll    : 42041
  42. SSSE3     : 8435
  43. AVX2      : 9131
  44.  
  45. 8192 ELEMENTS BY 100 RESULTS
  46. Simple    : 7694
  47. Unroll    : 8136
  48. SSSE3     : 621
  49. AVX2      : 496
  50. 1048576 ELEMENTS BY 4 RESULTS
  51. Simple    : 47063
  52. Unroll    : 49698
  53. SSSE3     : 8499
  54. AVX2      : 9932
Lazarus v. 4.99. FPC v. 3.3.1. Windows 11

LemonParty

  • Hero Member
  • *****
  • Posts: 526
Re: Benchmark swap endianness of buffer
« Reply #1 on: May 22, 2026, 04:16:56 pm »
stopwatch.pas
Lazarus v. 4.99. FPC v. 3.3.1. Windows 11

Thaddy

  • Hero Member
  • *****
  • Posts: 19249
  • Glad to be alive.
Re: Benchmark swap endianness of buffer
« Reply #2 on: May 23, 2026, 06:10:04 am »
Tested on 64-bit only. I also wrote code for UNIX, but it is not tested.
Why would that in any way differ?  Serious answer please.
Also, did you test the network byte order code as well?
BEtoN/NtoBE to name some possibilities.
https://www.freepascal.org/docs-html/rtl/system/swapendian.html and its summary.
Also note that I see no alignment instructions where you are using SIMD. (common mistake)
What is the - processor - cache behavior and did you allow for that? (Even more common mistake)
What does the disassembly from the native Pascal solution show on higher optimization level? (-al option, *.s file)
You left that out.
Can you also specify the processor and family? Because e.g. there can be differences between Intel and AMD and these can be considerable even if they  support the same instructions.

I actually suspect that the compiler does a better job than hand-written assembler code, because the compiler takes into account some of my points. It may look less optimized, but I know from experience that is often not the case when you time the code correctly.
« Last Edit: May 23, 2026, 06:20:22 am by Thaddy »
objects are fine constructs. You can even initialize them with constructors.

LemonParty

  • Hero Member
  • *****
  • Posts: 526
Re: Benchmark swap endianness of buffer
« Reply #3 on: May 23, 2026, 10:36:13 am »
Here are results with align 32 for assembler loops:
Code: Pascal  [Select][+][-]
  1. 8192 ELEMENTS BY 100 RESULTS
  2. Simple    : 10442
  3. Unroll    : 8333
  4. SSSE3     : 661
  5. AVX2      : 410
  6. 1048576 ELEMENTS BY 4 RESULTS
  7. Simple    : 43406
  8. Unroll    : 44813
  9. SSSE3     : 7851
  10. AVX2      : 9329
  11.  
  12. 8192 ELEMENTS BY 100 RESULTS
  13. Simple    : 8328
  14. Unroll    : 7742
  15. SSSE3     : 618
  16. AVX2      : 493
  17. 1048576 ELEMENTS BY 4 RESULTS
  18. Simple    : 42765
  19. Unroll    : 43038
  20. SSSE3     : 7820
  21. AVX2      : 8510
  22.  
  23. 8192 ELEMENTS BY 100 RESULTS
  24. Simple    : 7541
  25. Unroll    : 12080
  26. SSSE3     : 578
  27. AVX2      : 355
  28. 1048576 ELEMENTS BY 4 RESULTS
  29. Simple    : 41936
  30. Unroll    : 43036
  31. SSSE3     : 8311
  32. AVX2      : 8810
  33.  
  34. 8192 ELEMENTS BY 100 RESULTS
  35. Simple    : 10074
  36. Unroll    : 7715
  37. SSSE3     : 665
  38. AVX2      : 466
  39. 1048576 ELEMENTS BY 4 RESULTS
  40. Simple    : 46391
  41. Unroll    : 42099
  42. SSSE3     : 8016
  43. AVX2      : 9378
  44.  
  45. 8192 ELEMENTS BY 100 RESULTS
  46. Simple    : 8347
  47. Unroll    : 7542
  48. SSSE3     : 646
  49. AVX2      : 492
  50. 1048576 ELEMENTS BY 4 RESULTS
  51. Simple    : 46416
  52. Unroll    : 48360
  53. SSSE3     : 8453
  54. AVX2      : 8895
I did not expect it, but the results are a little better than without alignment.

Quote
Why would that in any way differ?  Serious answer please.
Windows and UNIX use different calling conventions. But really, in this case I could write code that is valid for both OSes.

Here is assembler code of SwapEndianSimple:
Code: ASM  [Select][+][-]
  1. # Var i located in register rdi
  2.   movq  %rcx,%rbx
  3. # Var Buf located in register rbx
  4.   movq  %rdx,%rsi
  5. # Var Count located in register rsi
  6. # Var Count located in register rsi
  7. # [18] for i:= 1 to Count do
  8.   cmpq  $1,%rdx
  9.   jnae  .Lj6
  10.   xorl  %edi,%edi
  11.   .p2align 4,,10
  12.   .p2align 3
  13. .Lj7:
  14.   addq  $1,%rdi
  15. # [19] aBuf[i]:= SwapEndian(aBuf[i]);
  16.   movl  (%rbx,%rdi,4),%ecx
  17.   call  SYSTEM_$$_SWAPENDIAN$LONGINT$$LONGINT
  18.   movl  %eax,(%rbx,%rdi,4)
  19.   cmpq  %rdi,%rsi
  20.   jnbe  .Lj7
  21. .Lj6:
  22. # [20] end;
  23.   nop
  24.   leaq  32(%rsp),%rsp
  25.   popq  %rsi
  26. .Lc7:
  27.   popq  %rdi
  28. .Lc8:
  29.   popq  %rbx
  30. .Lc9:
  31.   ret

SwapEndianUnroll:
Code: ASM  [Select][+][-]
  1. # Var Buf located in register rbx
  2.   movq  %rdx,%rsi
  3. # Var Count located in register rsi
  4. # [27] Top:= Count and -4;
  5.   movq  %rdx,%rdi
  6.   andq  $-4,%rdi
  7. # Var Top located in register rdi
  8. # Var i located in register r12
  9. # [28] i:= Low(aBuf);
  10.   xorl  %r12d,%r12d
  11. # [30] while i < Top do begin
  12.   jmp  .Lj13
  13.   .p2align 4,,10
  14.   .p2align 3
  15. .Lj12:
  16. # [31] aBuf[i]:= SwapEndian(aBuf[i]);
  17.   movl  (%rbx,%r12,4),%ecx
  18.   call  SYSTEM_$$_SWAPENDIAN$LONGINT$$LONGINT
  19.   movl  %eax,(%rbx,%r12,4)
  20. # [32] aBuf[i+1]:= SwapEndian(aBuf[i+1]);
  21.   movl  4(%rbx,%r12,4),%ecx
  22.   call  SYSTEM_$$_SWAPENDIAN$LONGINT$$LONGINT
  23.   movl  %eax,4(%rbx,%r12,4)
  24. # [33] aBuf[i+2]:= SwapEndian(aBuf[i+2]);
  25.   movl  8(%rbx,%r12,4),%ecx
  26.   call  SYSTEM_$$_SWAPENDIAN$LONGINT$$LONGINT
  27.   movl  %eax,8(%rbx,%r12,4)
  28. # [34] aBuf[i+3]:= SwapEndian(aBuf[i+3]);
  29.   movl  12(%rbx,%r12,4),%ecx
  30.   call  SYSTEM_$$_SWAPENDIAN$LONGINT$$LONGINT
  31.   movl  %eax,12(%rbx,%r12,4)
  32. # [35] Inc(i, 4);
  33.   addq  $4,%r12
  34. .Lj13:
  35.   cmpq  %r12,%rdi
  36.   ja  .Lj12
  37. # [38] if (Count and 2) <> 0 then begin
  38.   testb  $2,%sil
  39.   je  .Lj16
  40. # [39] aBuf[i]:= SwapEndian(aBuf[i]);
  41.   movl  (%rbx,%r12,4),%ecx
  42.   call  SYSTEM_$$_SWAPENDIAN$LONGINT$$LONGINT
  43.   movl  %eax,(%rbx,%r12,4)
  44. # [40] aBuf[i+1]:= SwapEndian(aBuf[i+1]);
  45.   movl  4(%rbx,%r12,4),%ecx
  46.   call  SYSTEM_$$_SWAPENDIAN$LONGINT$$LONGINT
  47.   movl  %eax,4(%rbx,%r12,4)
  48. # [41] Inc(i, 2);
  49.   addq  $2,%r12
  50. .Lj16:
  51. # [44] if (Count and 1) <> 0 then begin
  52.   andq  $1,%rsi
  53.   je  .Lj18
  54. # [45] aBuf[i]:= SwapEndian(aBuf[i]);
  55.   movl  (%rbx,%r12,4),%ecx
  56.   call  SYSTEM_$$_SWAPENDIAN$LONGINT$$LONGINT
  57.   movl  %eax,(%rbx,%r12,4)
  58. .Lj18:
  59. # [47] end;
  60.   nop
  61.   leaq  40(%rsp),%rsp
  62.   popq  %r12
  63. .Lc17:
  64.   popq  %rsi
  65. .Lc18:
  66.   popq  %rdi
  67. .Lc19:
  68.   popq  %rbx
  69. .Lc20:
  70.   ret
I expected SwapEndian to be an intrinsic that uses the bswap instruction under the hood, but it calls a function.

Tests were done on an Intel Core Ultra 7 258V.

Quote
I actually suspect that the compiler does a better job than hand-written assembler code, because the compiler takes into account some of my points. It may look less optimized, but I know from experience that is often not the case when you time the code correctly.
FPC does not yet support automatic vectorization. As the tests show, we get a ~10x speedup on small chunks and ~4x on big chunks. So I assume that in some cases writing assembler code is quite useful. In the other case, when the code does a lot of work with structures and many variables, the compiler may do better (and you will save a lot of time).
Lazarus v. 4.99. FPC v. 3.3.1. Windows 11

Warfley

  • Hero Member
  • *****
  • Posts: 2066
Re: Benchmark swap endianness of buffer
« Reply #4 on: May 23, 2026, 06:14:45 pm »
FPC isn't that great when it comes to optimization. It would be interesting to compare this to the FPC LLVM backend and maybe even to equivalent C/C++ code (also using LLVM).

From a practical point of view I must admit that knowing that FPC is bad at optimization, it's not a language I'd use for high performance code. Rather than having to write assembly, I'd just switch to C or C++ for those parts of the project

marcov

  • Administrator
  • Hero Member
  • *
  • Posts: 12894
  • FPC developer.
Re: Benchmark swap endianness of buffer
« Reply #5 on: May 23, 2026, 08:14:10 pm »
FPC isn't that great when it comes to optimization. It would be interesting to compare this to the FPC LLVM backend and maybe even to equivalent C/C++ code (also using LLVM).

From a practical point of view I must admit that knowing that FPC is bad at optimization, it's not a language I'd use for high performance code. Rather than having to write assembly, I'd just switch to C or C++ for those parts of the project

I use the simd library ( https://ermig1979.github.io/Simd/index.html )  for prototyping. For production I have to write something myself (usually a sequence of operations, saving on intermediate storage/mem bandwidth). Currently I mostly go back to assembler for that.

I mostly used it for color distance filtering (a single linear distance between two RGB pixels using HSV)
« Last Edit: May 24, 2026, 11:52:59 pm by marcov »

LemonParty

  • Hero Member
  • *****
  • Posts: 526
Re: Benchmark swap endianness of buffer
« Reply #6 on: May 23, 2026, 08:32:40 pm »
The simd library has dynamic libraries? And wrapper for Pascal?
Lazarus v. 4.99. FPC v. 3.3.1. Windows 11

marcov

  • Administrator
  • Hero Member
  • *
  • Posts: 12894
  • FPC developer.
Re: Benchmark swap endianness of buffer
« Reply #7 on: May 23, 2026, 08:51:50 pm »
The simd library has dynamic libraries? And wrapper for Pascal?

I think I linked it into a DLL myself, and added Pascal-callable prototypes as needed. It was in VS2015. Also, relatively simple SIMD transformations on a single image type (e.g. 8-bit BW, which I use a lot) are not that hard or time consuming.
« Last Edit: May 24, 2026, 11:51:21 pm by marcov »

LemonParty

  • Hero Member
  • *****
  • Posts: 526
Re: Benchmark swap endianness of buffer
« Reply #8 on: May 27, 2026, 10:30:56 am »
I updated my benchmark:
1. Added procedure SwapEndianAVX2x2 that uses 4 accumulators instead of 2.
2. Added the simplest SwapEndianBswap procedure, which uses the BSWAP instruction.

Results of 5 runs:
Code: Pascal  [Select][+][-]
  1. 8192 ELEMENTS BY 100 RESULTS
  2. Simple    : 8641
  3. Unroll    : 8046
  4. Bswap     : 31885
  5. SSSE3     : 596
  6. AVX2      : 449
  7. AVX2x2    : 374
  8. 1048576 ELEMENTS BY 4 RESULTS
  9. Simple    : 41818
  10. Unroll    : 45237
  11. Bswap     : 138609
  12. SSSE3     : 10434
  13. AVX2      : 8395
  14. AVX2x2    : 9979
  15.  
  16. 8192 ELEMENTS BY 100 RESULTS
  17. Simple    : 7690
  18. Unroll    : 7492
  19. Bswap     : 26883
  20. SSSE3     : 646
  21. AVX2      : 413
  22. AVX2x2    : 644
  23. 1048576 ELEMENTS BY 4 RESULTS
  24. Simple    : 44298
  25. Unroll    : 41521
  26. Bswap     : 139984
  27. SSSE3     : 8049
  28. AVX2      : 8860
  29. AVX2x2    : 8662
  30.  
  31. 8192 ELEMENTS BY 100 RESULTS
  32. Simple    : 7516
  33. Unroll    : 10513
  34. Bswap     : 27209
  35. SSSE3     : 622
  36. AVX2      : 350
  37. AVX2x2    : 465
  38. 1048576 ELEMENTS BY 4 RESULTS
  39. Simple    : 53943
  40. Unroll    : 42721
  41. Bswap     : 139055
  42. SSSE3     : 8285
  43. AVX2      : 12236
  44. AVX2x2    : 8556
  45.  
  46. 8192 ELEMENTS BY 100 RESULTS
  47. Simple    : 7519
  48. Unroll    : 7500
  49. Bswap     : 27487
  50. SSSE3     : 640
  51. AVX2      : 423
  52. AVX2x2    : 344
  53. 1048576 ELEMENTS BY 4 RESULTS
  54. Simple    : 46707
  55. Unroll    : 42772
  56. Bswap     : 146639
  57. SSSE3     : 8268
  58. AVX2      : 8630
  59. AVX2x2    : 8484
  60.  
  61. 8192 ELEMENTS BY 100 RESULTS
  62. Simple    : 8814
  63. Unroll    : 8145
  64. Bswap     : 26905
  65. SSSE3     : 645
  66. AVX2      : 356
  67. AVX2x2    : 291
  68. 1048576 ELEMENTS BY 4 RESULTS
  69. Simple    : 60304
  70. Unroll    : 42540
  71. Bswap     : 152422
  72. SSSE3     : 8351
  73. AVX2      : 9190
  74. AVX2x2    : 8840
As you can see, doubling the number of accumulators increases speed further. But increasing the number of accumulators does not affect the results on large chunks (we hit the memory-speed bottleneck).
I assumed that the BSWAP procedure would be faster than the simple and unrolled procedures, but it turned out to be extremely slow. Does anybody have an idea why it is so bad?
Lazarus v. 4.99. FPC v. 3.3.1. Windows 11

MathMan

  • Hero Member
  • *****
  • Posts: 515
Re: Benchmark swap endianness of buffer
« Reply #9 on: May 27, 2026, 02:05:39 pm »
I think i can. Take a look at your newly added function

Code: Pascal  [Select][+][-]
  { Reverses the byte order of Count consecutive LongInt values in place with
    the BSWAP instruction, one element per iteration.

    Buf   - first element of the buffer.
    Count - number of LongInt elements to convert; 0 is a no-op.

    The element pointer itself (Buf) is advanced by 4 bytes on every
    iteration and Count is decremented until it reaches zero. }
  procedure SwapEndianBswap(var Buf: LongInt; Count: SizeUInt);assembler;nostackframe;
  asm
    // nothing to do for an empty buffer
    test Count,Count
    jz @Done

    align 32
  @Next:
    mov eax,[Buf]
    bswap eax
    mov [Buf],eax
    dec Count
    // lea, unlike add, leaves the flags set by dec intact for the jnz below
    lea Buf,[Buf+4]
    jnz @Next

  @Done:
  end;
  17.  

You retrieve the UInt32 via '[Buf]' and manipulate the pointer 'Buf' manually by adding 4 (=SizeOf( UInt32 ) ). This adds a dependency in the loop that slows down the execution.

You can lift that dependency e.g. with below variant (Win64 only)

Code: Pascal  [Select][+][-]
  { Reverses the byte order of Count consecutive LongInt values in place with
    the BSWAP instruction, addressing elements as Buf + 4 * index.

    Buf   - first element of the buffer; it is never modified.
    Count - number of LongInt elements to convert; 0 is a no-op.

    Win64 only: the index register is written as RDX, which is assumed to be
    the register that carries Count. Count is decremented first, so it serves
    as the index and the buffer is processed from the last element down to
    element 0. }
  procedure SwapEndianBswap(var Buf: LongInt; Count: SizeUInt);assembler;nostackframe;
  asm
    // enter at the counter check so that Count = 0 does no work
    jmp @Step

    align 16
  @Body:
    mov eax,[Buf+4*RDX]
    bswap eax
    mov dword ptr [Buf+4*RDX],eax

  @Step:
    // the borrow (carry set) once Count drops below zero ends the loop
    sub Count, 1
    jnc @Body
  end;
  16.  

If you take a look (via debugger) how the other two functions are doing it, you'll see they use the same approach.

On my machine the above variant of 'SwapEndianBswap' is consistently faster than the other two.

LemonParty

  • Hero Member
  • *****
  • Posts: 526
Re: Benchmark swap endianness of buffer
« Reply #10 on: May 27, 2026, 03:23:20 pm »
MathMan, your code works surprisingly well.

I also added a function SwapEndianBswapx2 that swaps 2 integers per loop iteration. It practically doubles the speed compared to SwapEndianBswap.

New results:
Code: Pascal  [Select][+][-]
  1. 8192 ELEMENTS BY 100 RESULTS
  2. Simple    : 7548
  3. Unroll    : 8160
  4. Bswap     : 3872
  5. Bswapx2   : 2193
  6. SSSE3     : 549
  7. AVX2      : 813
  8. AVX2x2    : 385
  9. 1048576 ELEMENTS BY 4 RESULTS
  10. Simple    : 51043
  11. Unroll    : 47174
  12. Bswap     : 19333
  13. Bswapx2   : 21746
  14. SSSE3     : 11924
  15. AVX2      : 8166
  16. AVX2x2    : 7955
  17.  
  18. 8192 ELEMENTS BY 100 RESULTS
  19. Simple    : 7685
  20. Unroll    : 8213
  21. Bswap     : 3749
  22. Bswapx2   : 2064
  23. SSSE3     : 549
  24. AVX2      : 351
  25. AVX2x2    : 293
  26. 1048576 ELEMENTS BY 4 RESULTS
  27. Simple    : 45640
  28. Unroll    : 41838
  29. Bswap     : 19945
  30. Bswapx2   : 13481
  31. SSSE3     : 7986
  32. AVX2      : 8160
  33. AVX2x2    : 7990
  34.  
  35. 8192 ELEMENTS BY 100 RESULTS
  36. Simple    : 7578
  37. Unroll    : 8902
  38. Bswap     : 3895
  39. Bswapx2   : 1961
  40. SSSE3     : 663
  41. AVX2      : 369
  42. AVX2x2    : 293
  43. 1048576 ELEMENTS BY 4 RESULTS
  44. Simple    : 44344
  45. Unroll    : 43201
  46. Bswap     : 19624
  47. Bswapx2   : 10460
  48. SSSE3     : 7271
  49. AVX2      : 11652
  50. AVX2x2    : 10907
  51.  
  52. 8192 ELEMENTS BY 100 RESULTS
  53. Simple    : 7553
  54. Unroll    : 7494
  55. Bswap     : 3749
  56. Bswapx2   : 2308
  57. SSSE3     : 616
  58. AVX2      : 379
  59. AVX2x2    : 294
  60. 1048576 ELEMENTS BY 4 RESULTS
  61. Simple    : 44795
  62. Unroll    : 44269
  63. Bswap     : 20628
  64. Bswapx2   : 10417
  65. SSSE3     : 7564
  66. AVX2      : 7631
  67. AVX2x2    : 9019
  68.  
  69. 8192 ELEMENTS BY 100 RESULTS
  70. Simple    : 12452
  71. Unroll    : 7572
  72. Bswap     : 3750
  73. Bswapx2   : 2106
  74. SSSE3     : 657
  75. AVX2      : 395
  76. AVX2x2    : 346
  77. 1048576 ELEMENTS BY 4 RESULTS
  78. Simple    : 42564
  79. Unroll    : 43259
  80. Bswap     : 19943
  81. Bswapx2   : 10458
  82. SSSE3     : 8060
  83. AVX2      : 8746
  84. AVX2x2    : 13367
Lazarus v. 4.99. FPC v. 3.3.1. Windows 11

MathMan

  • Hero Member
  • *****
  • Posts: 515
Re: Benchmark swap endianness of buffer
« Reply #11 on: May 27, 2026, 07:55:45 pm »
MathMan, your code works surprisingly well.

<snip>

I assumed as much. However, this is not a cure-all solution! It only works if your target architecture has enough address calculation capacity to handle the increased amount of SIB addressing. Older x86-64 CPUs, or loops with a lot of memory addressing, will not gain from this approach and may even get slower.

 

TinyPortal © 2005-2018