Recent

Author Topic: Slow TStringList.Sort in Applications.  (Read 660 times)

Borneq

  • Full Member
  • ***
  • Posts: 248
Slow TStringList.Sort in Applications.
« on: May 02, 2022, 03:21:42 pm »
This forum is proper for this post? Post is about error/bad behavior of FPC.
I have tested sort time in FPC console, FPC applications and GCC release console. I have Linux Fedora and Laptop with core I3 20 GB RAM.
variable d means:
big value like 100 = most strings are different each with other
small value like 0.01 = a lot of the same string valiues
First FPC console compiled with aggressive optimalization
Code: Pascal  [Select][+][-]
  1. program timeSortCon;
  2.  
  3. uses Linux, unixtype, Classes, SysUtils, Math;
  4.  
  5.  
  6. procedure fill(list: TStringList; size: integer; d: double);
  7. var
  8.   i,n: integer;
  9. begin
  10.   list.Clear;
  11.   for i:=0 to size-1 do
  12.   begin
  13.     n:=Random(round(size*d));
  14.     list.Add(IntToStr(n));
  15.   end;
  16. end;
  17.  
  18. const c1e9 = 1000*1000*1000;
  19. var
  20.   tp1,tp2: ttimespec;
  21.   t1,t2: int64;
  22.   list:TStringList;
  23.   size: integer;
  24.   d: double;
  25. begin
  26.   list:=TStringList.Create;
  27.   size:=1024;
  28.   d:=100;
  29.   repeat
  30.      Fill(list, size, d);
  31.      clock_gettime(CLOCK_MONOTONIC, @tp1);
  32.      list.sort();
  33.      clock_gettime(CLOCK_MONOTONIC, @tp2);
  34.      t1:=tp1.tv_nsec+tp1.tv_sec*c1e9;
  35.      t2:=tp2.tv_nsec+tp2.tv_sec*c1e9;
  36.      writeln(FloatToStr((t2-t1)/Math.Log2(size)/size));
  37.      size:=size*2;
  38.   until size>4*1024*1024;
  39.   list.Free;
  40.   writeln('end');
  41. end.
  42.  
output is

33.86015625
33.3103249289773
33.4970092773438
35.7384127103365
37.3490033830915
38.675048828125
38.206618309021
42.8031939338235
45.9571946461995
51.8758956507633
53.9631856918335
62.9143598420279
69.6882724003358
end

id d = 0.01:

112.74697265625
104.522283380682
88.3057861328125
66.2308349609375
65.185782296317
66.95908203125
66.3638725280762
70.4515856574563
77.0949376424154
77.421782844945
80.1974610805511
87.4744812647502
91.7128844369542
end

A bit slower
Compare to C++:
Code: C  [Select][+][-]
  1. #include <iostream>
  2. #include <algorithm>
  3. #include <vector>
  4. #include <chrono>
  5. #include <cmath>
  6. using namespace std;
  7.  
  8. class stopwatch
  9. {
  10. public:
  11.     chrono::time_point<chrono::high_resolution_clock> a, b;
  12.     void start() { a = chrono::high_resolution_clock::now(); }
  13.     void stop() { b = chrono::high_resolution_clock::now(); }
  14.     double duration()
  15.     {
  16.         chrono::duration<double> elapsed_seconds = b - a;
  17.         return elapsed_seconds.count();
  18.     }
  19. };
  20.  
  21. void fill(vector<string> &list, int size, double d) {
  22.     list.clear();
  23.     list.resize(size);
  24.     for (int i=0; i<size; i++)
  25.         list[i] = to_string(rand()%((int)(size*d)));
  26. }
  27.  
  28. int main() {
  29.     int size = 1024;
  30.     double d = 100;
  31.     vector<string> list;
  32.     do{
  33.         fill(list, size, d);
  34.         stopwatch st;
  35.         st.start();
  36.         sort(list.begin(), list.end());
  37.         st.stop();
  38.         cout << st.duration()*1e9/ log2(size)/size << endl;
  39.         size *= 2;
  40.     }while(size<=4*1024*1024);
  41.     cout << "end" <<endl;
  42.     return 0;
  43. }
  44.  
d=100

16.7077
15.6237
16.1659
14.5625
13.7777
15.116
13.3096
13.6631
13.0391
12.9087
12.9574
12.8193
12.6872
end

d=0.01, even faster!

8.36992
10.4783
10.5804
10.4236
9.90457
9.99056
9.80442
9.87245
9.76277
9.91909
9.97854
10.568
10.2577
end


FPC is several times slower, but fatal is in form application.
I make thread , but it is not important, the same is in main thread, except that thread allows convenient view results, when process not yet finished
Code: Pascal  [Select][+][-]
  1. program timeSort;
  2.  
  3. {$mode objfpc}{$H+}
  4. {$define UseCThreads}
  5.  
  6. uses
  7.   {$IFDEF UNIX}{$IFDEF UseCThreads}
  8.   cthreads,cmem,
  9.   {$ENDIF}{$ENDIF}
  10.   Interfaces, // this includes the LCL widgetset
  11.   Forms, Unit1
  12.   { you can add units after this };
  13.  
  14. {$R *.res}
  15.  
  16. begin
  17.   RequireDerivedFormResource:=True;
  18.   Application.Scaled:=True;
  19.   Application.Initialize;
  20.   Application.CreateForm(TForm1, Form1);
  21.   Application.Run;
  22. end.
  23.  
and
Code: Text  [Select][+][-]
  1. object Form1: TForm1
  2.   Left = 248
  3.   Height = 240
  4.   Top = 131
  5.   Width = 320
  6.   Caption = 'Form1'
  7.   ClientHeight = 240
  8.   ClientWidth = 320
  9.   OnCreate = FormCreate
  10.   LCLVersion = '2.0.12.0'
  11.   object Button1: TButton
  12.     Left = 76
  13.     Height = 25
  14.     Top = 22
  15.     Width = 75
  16.     Caption = 'Button1'
  17.     OnClick = Button1Click
  18.     TabOrder = 0
  19.   end
  20.   object Memo1: TMemo
  21.     Left = 16
  22.     Height = 130
  23.     Top = 96
  24.     Width = 296
  25.     Lines.Strings = (
  26.       'Memo1'
  27.     )
  28.     ScrollBars = ssAutoBoth
  29.     TabOrder = 1
  30.   end
  31. end
  32.  
and
Code: Pascal  [Select][+][-]
  1. unit Unit1;
  2.  
  3. {$mode objfpc}{$H+}
  4.  
  5. interface
  6.  
  7. uses
  8.   Classes, SysUtils, Forms, Controls, Graphics, Dialogs, StdCtrls,Math;
  9.  
  10. type
  11.  
  12.   { TForm1 }
  13.  
  14.   TForm1 = class(TForm)
  15.     Button1: TButton;
  16.     Memo1: TMemo;
  17.     procedure Button1Click(Sender: TObject);
  18.   private
  19.   public
  20.   end;
  21.  
  22. var
  23.   Form1: TForm1;
  24.  
  25. implementation
  26.  
  27. uses Linux, unixtype;
  28.  
  29. {$R *.lfm}
  30.  
  31. type
  32.  
  33.   { TTestThread }
  34.  
  35.   TTestThread = class(TThread)
  36.   protected
  37.     fToMemo: string;
  38.     procedure AddToMemo;
  39.     procedure Execute; override;
  40.   public
  41.     constructor Create();
  42.   end;
  43.  
  44. { TTestThread }
  45.  
  46. procedure fill(list: TStringList; size: integer; d: double);
  47. var
  48.   i,n: integer;
  49. begin
  50.   list.Clear;
  51.   for i:=0 to size-1 do
  52.   begin
  53.     n:=Random(round(size*d));
  54.     list.Add(IntToStr(n));
  55.   end;
  56. end;
  57.  
  58. procedure TTestThread.Execute;
  59. const c1e9 = 1000*1000*1000;
  60. var
  61.   tp1,tp2: ttimespec;
  62.   t1,t2: int64;
  63.   list:TStringList;
  64.   size: integer;
  65.   d: double;
  66. begin
  67.   list:=TStringList.Create;
  68.   size:=1024;
  69.   d:=100;
  70.   repeat
  71.      Fill(list, size, d);
  72.      clock_gettime(CLOCK_MONOTONIC, @tp1);
  73.      list.sort();
  74.      clock_gettime(CLOCK_MONOTONIC, @tp2);
  75.      t1:=tp1.tv_nsec+tp1.tv_sec*c1e9;
  76.      t2:=tp2.tv_nsec+tp2.tv_sec*c1e9;
  77.      fToMemo := FloatToStr((t2-t1)/Math.Log2(size)/size);
  78.      synchronize(@AddTOMemo);
  79.      size:=size*2;
  80.   until size>4*1024*1024;
  81.   list.Free;
  82.   fToMemo := 'end';
  83.   synchronize(@AddTOMemo);
  84. end;
  85.  
  86. constructor TTestThread.Create();
  87. begin
  88.   inherited Create(false);
  89.   FreeOnTerminate:=true;
  90. end;
  91.  
  92. procedure TTestThread.AddToMemo;
  93. begin
  94.   Form1.Memo1.Lines.Add(fToMemo);
  95. end;
  96.  
  97. { TForm1 }
  98.  
  99. procedure TForm1.Button1Click(Sender: TObject);
  100. var
  101.   thread: TTestThread;
  102. begin
  103.   thread:=TTestThread.Create();
  104.   thread.Start;
  105. end;
  106.  
  107. end.
  108.  
Results for d=100 (aggressive optimization)

1955.9984375
1255.27476917614
505.58701578776
459.529766376202
486.121451241629
512.559214274089
517.353302955627
561.423980712891
604.507530212402
642.564016342163
644.596619796753
697.545514220283
716.581226251342
end

and for d=0.01

3564.07783203125
1221.64275568182
1036.1786702474
995.541672926683
1013.31421334403
1049.10948689779
1012.10831642151
1034.92465344597
1054.99673144023
1066.52670609324
1088.22270264626
1120.13203434717
1127.15126429905
end

Horrible slow!
So worst, BeginUpdate/EndUpdate
Code: Pascal  [Select][+][-]
  1.      list.BeginUpdate;
  2.      list.sort();
  3.      list.EndUpdate;
  4.  
not helps....

avk

  • Hero Member
  • *****
  • Posts: 616
    • my self-education project
Re: Slow TStringList.Sort in Applications.
« Reply #1 on: May 02, 2022, 04:41:19 pm »
Perhaps one reason could be that TStringList uses AnsiCompareText() as its default comparator, which, if I'm not mistaken, calls Utf8CompareText() under the hood in LCL applications.
What happens if you add
Code: Pascal  [Select][+][-]
  1. begin
  2.   list:=TStringList.Create;
  3.   list.UseLocale := False; // <---
  4.   ...
  5.  

Borneq

  • Full Member
  • ***
  • Posts: 248
Re: Slow TStringList.Sort in Applications.
« Reply #2 on: May 02, 2022, 07:38:46 pm »
Nothing happen in console program, but very speedup for Application  - now is time as console. Thanks.

paweld

  • Sr. Member
  • ****
  • Posts: 456
Re: Slow TStringList.Sort in Applications.
« Reply #3 on: May 02, 2022, 09:35:10 pm »
use generics, eg. Generics.Collections or LGenerics ( https://github.com/avk959/LGenerics )
Best regards
paweld

Warfley

  • Hero Member
  • *****
  • Posts: 914
Re: Slow TStringList.Sort in Applications.
« Reply #4 on: May 04, 2022, 08:43:08 pm »
The difference between FPC and C++ is to be expected. The major difference here is that the FPC version uses function pointers for both comparison, but also swapping. C++ on the other hand takes the comparator as template parameter and does the swapping using std::move. Both of these calls can be inlined.

For example:
Code: Pascal  [Select][+][-]
  1. program Project1;
  2.  
  3. {$mode objfpc}{$H+}
  4.  
  5. uses
  6.   SysUtils;
  7.  
  8. type
  9.   TIntArray = Array of Integer;
  10.   TLessFunc = function(A, B: Integer): Boolean;
  11.   TSwapFunc = procedure(var A, B: Integer);
  12.  
  13. function Less(A, B: Integer): Boolean; inline;
  14. begin
  15.   Result := A < B;
  16. end;
  17.  
  18. procedure Swap(var A, B: Integer); inline;
  19. begin
  20.   A := A Xor B;
  21.   B := B Xor A;
  22.   A := A Xor B;
  23. end;
  24.  
  25. procedure BubbleSortFP(data: TIntArray; LessFunc: TLessFunc; SwapFunc: TSwapFunc);
  26. var
  27.   i, j: Integer;
  28. begin
  29.   for i := 1 to Length(data) - 1 do
  30.     for j := 0 to Length(data) - 1 - i do
  31.       if LessFunc(data[j + 1], data[j]) then SwapFunc(data[j], data[j + 1]);
  32. end;
  33.  
  34. procedure BubbleSortInline(data: TIntArray);
  35. var
  36.   i, j: Integer;
  37. begin
  38.   for i := 0 to Length(data) - 1 do
  39.     for j := 0 to Length(data) - 2 - i do
  40.       if Less(data[j + 1], data[j]) then Swap(data[j], data[j + 1]);
  41. end;
  42.  
  43. var
  44.   data: Array of Integer;
  45.   i: Integer;
  46.   Start: QWord;
  47. begin
  48.   SetLength(data, 10000);
  49.  
  50.   for i:=0 to length(data) -1 do
  51.     data[i] := Random(Integer.MaxValue);
  52.   Start := GetTickCount64;
  53.   BubbleSortFP(data, @Less, @Swap);
  54.   WriteLn('FuncPtr: ', GetTickCount64 - Start);
  55.  
  56.   // Refill array with new numbers to "fool" predictive caching of CPU
  57.   for i:=0 to length(data) -1 do
  58.     data[i] := Random(Integer.MaxValue);
  59.   Start := GetTickCount64;
  60.   BubbleSortInline(data);
  61.   WriteLn('Inline: ', GetTickCount64 - Start);
  62.   ReadLn;
  63. end.
On my machine the BubbleSortFP takes around double the time the inlined version BubbleSortInline does, just by passing function pointers.

Generally speaking, the classes in the RTL (or generally most parts of the RTL) are not optimised for performance. Plus generally are most C++ compilers much better at optimisation than the FPC.

PS: what this example also wonderfully demonstrates is the power of predictive caching, remove the second filling of the array with new random values and you will see a major speedup on many modern CPU, on mine its like 4x
« Last Edit: May 04, 2022, 08:52:54 pm by Warfley »

 

TinyPortal © 2005-2018