
Author Topic: efficiency problem  (Read 51492 times)

LV

  • Sr. Member
  • ****
  • Posts: 266
Re: efficiency problem
« Reply #120 on: March 01, 2025, 08:25:43 pm »
So, anyway, AlgLib's 108 ms with a single thread is not bad. But the multi-threaded version of AlgLib is not free, unfortunately.

This evening, I had some free time and decided to parallelize the free version of the AlgLib library (see the attached code). Below are the results of comparative calculations, with an average derived from three program runs on a desktop with an Intel i7 8700 processor, which has 6 cores and 12 threads.

Code: Text
Code I   (one-threaded) from Reply #89                966 ms
Code II  (one-threaded) from Reply #103               363 ms
Code III (Code II four-threaded) from Reply #116      104 ms
Code IV  (AlgLib one-threaded) from Reply #110        107 ms
Code V   (AlgLib four-threaded) see attached code      53 ms
NumPy    (twelve-threaded?)                            15 ms

Code: Pascal
program MatrixMultiplicationALGLIB;

{$mode objfpc}{$H+}

uses
  SysUtils,
  Math,
  DateUtils,
  Classes,
  xalglib;

const
  N = 1000;

type
  TMatrixMultiplier = class(TThread)
  public
    A, B, C: xalglib.TMatrix;
    OffsetA, OffsetB, OffsetC: Integer;
    Rows, Cols: Integer;
    procedure Execute; override;
  end;

var
  A, B, C, CBase: xalglib.TMatrix;
  A1, A2, B1, B2: xalglib.TMatrix;
  C11, C12, C21, C22: xalglib.TMatrix;
  i, j, halfN, restN: integer;
  StartTime, EndTime: TDateTime;
  Threads: array[0..3] of TThread;

procedure TMatrixMultiplier.Execute;
begin
  try
    if (Length(A) > 0) and (Length(B) > 0) then
      RMatrixGEMM(Rows, Cols, N, 1.0,
        A, OffsetA, 0, 0,
        B, 0, OffsetB, 0,
        0.0, C, OffsetC, 0);
  except
    on E: Exception do
      Writeln('Thread error: ', E.ClassName, ': ', E.Message);
  end;
end;

procedure InitializeAlignedMatrix(var M: xalglib.TMatrix; Rows, Cols: Integer);
var
  i: Integer;
begin
  SetLength(M, Rows);
  for i := 0 to Rows - 1 do
  begin
    SetLength(M[i], Cols);
    FillChar(M[i][0], Cols * SizeOf(Double), 0);
  end;
end;

procedure GenerateRandomMatrix(var M: xalglib.TMatrix; Rows, Cols: Integer);
var
  i, j: Integer;
begin
  InitializeAlignedMatrix(M, Rows, Cols);
  for i := 0 to Rows - 1 do
    for j := 0 to Cols - 1 do
      M[i][j] := Random(10);
end;

function MatricesEqual(const Mat1, Mat2: xalglib.TMatrix): Boolean;
var
  i, j: Integer;
begin
  Result := True;
  for i := 0 to High(Mat1) do
    for j := 0 to High(Mat1[i]) do
      if not SameValue(Mat1[i][j], Mat2[i][j], 1e-9) then
        Exit(False);
end;

begin
  try
    // Initialize matrices with alignment
    GenerateRandomMatrix(A, N, N);
    GenerateRandomMatrix(B, N, N);
    InitializeAlignedMatrix(C, N, N);
    InitializeAlignedMatrix(CBase, N, N);

    // Standard multiplication
    StartTime := Now;
    RMatrixGEMM(N, N, N, 1.0, A, 0, 0, 0, B, 0, 0, 0, 0.0, C, 0, 0);
    EndTime := Now;
    Writeln('Standard: ', MilliSecondsBetween(EndTime, StartTime), ' ms');

    // Parallel block multiplication
    StartTime := Now;
    halfN := N div 2;
    restN := N - halfN;

    // Split matrix A
    SetLength(A1, halfN);
    SetLength(A2, restN);
    for i := 0 to halfN - 1 do
    begin
      A1[i] := Copy(A[i], 0, N);
      A2[i] := Copy(A[i + halfN], 0, N);
    end;

    // Split matrix B
    SetLength(B1, N, halfN);
    SetLength(B2, N, halfN);
    for i := 0 to N - 1 do
    begin
      B1[i] := Copy(B[i], 0, halfN);
      B2[i] := Copy(B[i], halfN, halfN);
    end;

    // Initialize result matrices
    InitializeAlignedMatrix(C11, halfN, halfN);
    InitializeAlignedMatrix(C12, halfN, restN);
    InitializeAlignedMatrix(C21, restN, halfN);
    InitializeAlignedMatrix(C22, restN, restN);

    // Configure threads
    Threads[0] := TMatrixMultiplier.Create(True);
    with TMatrixMultiplier(Threads[0]) do begin
      A := A1; B := B1; C := C11;
      OffsetA := 0; OffsetB := 0; OffsetC := 0;
      Rows := halfN; Cols := halfN;
      FreeOnTerminate := False;
    end;

    Threads[1] := TMatrixMultiplier.Create(True);
    with TMatrixMultiplier(Threads[1]) do begin
      A := A2; B := B2; C := C22;
      OffsetA := 0; OffsetB := 0; OffsetC := 0;
      Rows := restN; Cols := restN;
      FreeOnTerminate := False;
    end;

    Threads[2] := TMatrixMultiplier.Create(True);
    with TMatrixMultiplier(Threads[2]) do begin
      A := A1; B := B2; C := C12;
      OffsetA := 0; OffsetB := 0; OffsetC := 0;
      Rows := halfN; Cols := restN;
      FreeOnTerminate := False;
    end;

    Threads[3] := TMatrixMultiplier.Create(True);
    with TMatrixMultiplier(Threads[3]) do begin
      A := A2; B := B1; C := C21;
      OffsetA := 0; OffsetB := 0; OffsetC := 0;
      Rows := restN; Cols := halfN;
      FreeOnTerminate := False;
    end;

    // Start threads
    for i := 0 to 3 do Threads[i].Start;
    for i := 0 to 3 do Threads[i].WaitFor;

    // Assemble results
    for i := 0 to halfN - 1 do
    begin
      if (halfN > 0) and (Length(C11[i]) = halfN) then
        Move(C11[i][0], C[i][0], halfN * SizeOf(Double));
      if (restN > 0) and (Length(C12[i]) = restN) then
        Move(C12[i][0], C[i][halfN], restN * SizeOf(Double));
    end;

    for i := 0 to restN - 1 do
    begin
      if (halfN > 0) and (Length(C21[i]) = halfN) then
        Move(C21[i][0], C[i + halfN][0], halfN * SizeOf(Double));
      if (restN > 0) and (Length(C22[i]) = restN) then
        Move(C22[i][0], C[i + halfN][halfN], restN * SizeOf(Double));
    end;

    EndTime := Now;
    Writeln('Parallel: ', MilliSecondsBetween(EndTime, StartTime), ' ms');

    // Validation
    StartTime := Now;
    RMatrixGEMM(N, N, N, 1.0, A, 0, 0, 0, B, 0, 0, 0, 0.0, CBase, 0, 0);
    EndTime := Now;
    WriteLn('Validation: ', MatricesEqual(C, CBase));

    // Cleanup
    for i := 0 to 3 do Threads[i].Free;

  except
    on E: Exception do
      Writeln('Main error: ', E.ClassName, ': ', E.Message);
  end;

  Readln;
end.

Paolo

  • Hero Member
  • *****
  • Posts: 584
Re: efficiency problem
« Reply #121 on: March 01, 2025, 10:22:23 pm »
Hello,

Please try this instead of the code in Reply #89 and tell me if things improve.

Code: Pascal
const
  gN = 1000;

type
  TVector = array [0..gN-1] of double;
  TMatrix = array [0..gN-1] of TVector;
  PTVector = ^TVector;

...
...

// IKJ loop order: rows of AMat2 and AMat3 are walked sequentially.
// Note that the products are accumulated into AMat3, so the caller must zero it first.
procedure TForm1.IKJ_VCT(var AMat1, AMat2, AMat3: TMatrix);
var
  i, j, k : integer;
  A_ik : double;
  PVct1, PVct2, PVct3 : PTVector;
begin
  for i:=0 to gN-1 do begin
    PVct1:=@AMat1[i];
    PVct3:=@AMat3[i];
    for k:=0 to gN-1 do begin
      A_ik:=PVct1^[k];
      PVct2:=@AMat2[k];
      for j:=0 to gN-1 do
        PVct3^[j]:=PVct3^[j]+A_ik*PVct2^[j]
    end
  end
end;


LV

  • Sr. Member
  • ****
  • Posts: 266
Re: efficiency problem
« Reply #122 on: March 02, 2025, 08:46:41 am »
Hi @Paolo! Your suggestion improves performance by approximately 30% compared to the code in reply #89 (see attached code).

Code: Pascal
program MatrixMultiplication;

{$mode objfpc}{$H+}

uses
  SysUtils, Math, DateUtils;

const
  gN = 1000;

type
  TVector = array[0..gN-1] of Double;
  TMatrix = array[0..gN-1] of TVector;
  PTVector = ^TVector;

// Matrix initialization with random values
procedure RandomizeMatrix(var M: TMatrix);
var
  i, j: Integer;
begin
  Randomize;
  for i := 0 to gN - 1 do
    for j := 0 to gN - 1 do
      M[i][j] := Random * 1000;
end;

// Optimized multiplication (IKJ order)
procedure IKJ_VCT(var AMat1, AMat2, AMat3: TMatrix);
var
  i, j, k: Integer;
  A_ik: Double;
  PVct1, PVct2, PVct3: PTVector;
begin
  for i := 0 to gN - 1 do
  begin
    PVct1 := @AMat1[i];
    PVct3 := @AMat3[i];
    for k := 0 to gN - 1 do
    begin
      A_ik := PVct1^[k];
      PVct2 := @AMat2[k];
      for j := 0 to gN - 1 do
        PVct3^[j] := PVct3^[j] + A_ik * PVct2^[j];
    end;
  end;
end;

// Matrix multiplication (modified version)
procedure ModifiedMultiply(const mat1, mat2: TMatrix; var res: TMatrix);
var
  i, j, k: Integer;
  sum: Double;
  v: TVector;
begin
  FillChar(res, SizeOf(res), 0);
  for j := 0 to gN - 1 do
  begin
    for k := 0 to gN - 1 do
      v[k] := mat2[k][j];
    for i := 0 to gN - 1 do
    begin
      sum := 0;
      for k := 0 to gN - 1 do
        sum := sum + mat1[i][k] * v[k];
      res[i][j] := sum;
    end;
  end;
end;

// Matrix comparison
function IsSame(const mat1, mat2: TMatrix): Boolean;
var
  i, k: NativeUInt;
begin
  for i := 0 to gN - 1 do
    for k := 0 to gN - 1 do
      if not SameValue(mat1[i][k], mat2[i][k]) then
        Exit(False);
  Result := True;
end;

var
  Mat1, Mat2, MatRes, MatExpected: TMatrix;
  startTime: TDateTime;
  elapsedOpt, elapsedModified: Int64;
  i, j: Integer;
begin
  // Initialize matrices
  RandomizeMatrix(Mat1);
  RandomizeMatrix(Mat2);

  // Clear result matrices
  FillChar(MatRes, SizeOf(MatRes), 0);
  FillChar(MatExpected, SizeOf(MatExpected), 0);

  // Test optimized version
  startTime := Now;
  IKJ_VCT(Mat1, Mat2, MatRes);
  elapsedOpt := MilliSecondsBetween(startTime, Now);

  // Test modified version
  startTime := Now;
  ModifiedMultiply(Mat1, Mat2, MatExpected);
  elapsedModified := MilliSecondsBetween(startTime, Now);

  // Verify results
  WriteLn('Verification: ', IsSame(MatRes, MatExpected));
  WriteLn('Optimized time: ', elapsedOpt, ' ms');
  WriteLn('Modified time: ', elapsedModified, ' ms');
  WriteLn('Speed ratio: ', (elapsedModified/elapsedOpt):0:1, 'x');
  Readln;
end.

Performance Comparison Table Relative to NumPy:

Code: Text
Code I   (one-threaded) from Reply #89                64.4
Code II  (one-threaded) from Reply #121               51.0
Code III (one-threaded) from Reply #103               24.2
Code IV  (AlgLib one-threaded) from Reply #110         7.1
Code V   (Code III four-threaded) from Reply #116      7.0
Code VI  (AlgLib four-threaded) from Reply #120        3.5
NumPy    (twelve-threaded?)                            1.0
« Last Edit: March 02, 2025, 09:02:51 am by LV »

photor

  • Jr. Member
  • **
  • Posts: 80
Re: efficiency problem
« Reply #123 on: March 02, 2025, 09:21:31 am »
Quote from: LV on March 02, 2025, 08:46:41 am
Performance Comparison Table Relative to NumPy:

Code: Text
Code I   (one-threaded) from Reply #89                64.4
Code II  (one-threaded) from Reply #121               51.0
Code III (one-threaded) from Reply #103               24.2
Code IV  (AlgLib one-threaded) from Reply #110         7.1
Code V   (Code III four-threaded) from Reply #116      7.0
Code VI  (AlgLib four-threaded) from Reply #120        3.5
NumPy    (twelve-threaded?)                            1.0

NumPy is 8-threaded by default (on most modern computers).

Updated: on my computer (a somewhat old Intel i7 4700), NumPy is 4-threaded by default, using MKL on Win 11.
« Last Edit: March 02, 2025, 12:48:08 pm by photor »

LV

  • Sr. Member
  • ****
  • Posts: 266
Re: efficiency problem
« Reply #124 on: March 02, 2025, 10:33:22 am »
Hi @photor! Note: NumPy on my machine uses the MKL library. I tested performance with one and then four threads using that library. Here are the results:

Code: Text
                                                    ms
-------------------------------------------------------
Code I   (one-threaded) from Reply #89              966
Code II  (one-threaded) from Reply #121             765
Code III (one-threaded) from Reply #103             363
Code IV  (AlgLib one-threaded) from Reply #110      107
Code V   (Code III four-threaded) from Reply #116   104
Code VI  (AlgLib four-threaded) from Reply #120      53
NumPy    (one-threaded MKL)                         37
NumPy    (four-threaded MKL)                        14
-------------------------------------------------------

photor

  • Jr. Member
  • **
  • Posts: 80
Re: efficiency problem
« Reply #125 on: March 02, 2025, 12:52:39 pm »
Quote from: LV on March 02, 2025, 10:33:22 am
Hi @photor! Note: NumPy on my machine uses the MKL library. I tested performance with one and then four threads using that library. Here are the results:

Code: Text
                                                    ms
-------------------------------------------------------
Code I   (one-threaded) from Reply #89              966
Code II  (one-threaded) from Reply #121             765
Code III (one-threaded) from Reply #103             363
Code IV  (AlgLib one-threaded) from Reply #110      107
Code V   (Code III four-threaded) from Reply #116   104
Code VI  (AlgLib four-threaded) from Reply #120     53
NumPy    (one-threaded MKL)                         37
NumPy    (four-threaded MKL)                        14
-------------------------------------------------------

Thanks. Typically NumPy uses MKL or OpenBLAS as the backend. OpenBLAS is a completely free library written in C, so can Free Pascal call it directly instead of AlgLib?

LV

  • Sr. Member
  • ****
  • Posts: 266
Re: efficiency problem
« Reply #126 on: March 02, 2025, 01:28:25 pm »
Quote from: photor on March 02, 2025, 12:52:39 pm
Thanks. Typically NumPy uses MKL or OpenBLAS as the backend. OpenBLAS is a completely free library written in C, so can Free Pascal call it directly instead of AlgLib?

The short answer is yes. There's no need to "reinvent the wheel" for standard algebraic operations. However, implementing non-standard algorithms may require different considerations, and FPC and Lazarus are also attractive when it's important to quickly develop a graphical interface. There may be other reasons to choose FPC and Lazarus for your project as well. The issue of efficiency is complex and has many dimensions.  :)
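For illustration, here is a minimal sketch of what such a binding could look like, assuming the DLL exports the CBLAS interface (cblas_dgemm) and was built with a 32-bit blasint; the constants and names below are my own, not from any specific Pascal unit:

Code: Pascal
// Sketch only: declaring OpenBLAS's CBLAS entry point directly in Free Pascal.
// Assumes libopenblas is on the library search path and uses 32-bit blasint.
const
  CblasRowMajor = 101;  // CBLAS_ORDER
  CblasNoTrans  = 111;  // CBLAS_TRANSPOSE

procedure cblas_dgemm(Order, TransA, TransB, M, N, K: LongInt;
  alpha: Double; A: PDouble; lda: LongInt; B: PDouble; ldb: LongInt;
  beta: Double; C: PDouble; ldc: LongInt);
  cdecl; external 'libopenblas' name 'cblas_dgemm';

// Usage for flat, row-major N x N matrices A, B, C (dynamic arrays of Double):
//   cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
//               N, N, N, 1.0, @A[0], N, @B[0], N, 0.0, @C[0], N);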

Paolo

  • Hero Member
  • *****
  • Posts: 584
Re: efficiency problem
« Reply #127 on: March 11, 2025, 10:18:08 pm »
My last attempt with a single thread; can you check the speed?

I made the array size variable (mode = Delphi, no range checks).

Code: Pascal
type
  T0Arr = array [0..0] of double;
  TP0Arr = ^T0Arr;
...
procedure TForm1.IKJ_MEM2BIS(var AN: integer; var AM1, AM2, AM3: TP0Arr);
var
  M1ik : double;
  M2k, M3ij, M1i, M3i : ^double;
  i, j, k : integer;
  C : integer;
begin
  M1i:=@AM1[0];
  for i:=0 to An-1 do begin
    M3i:=@AM3[i*An];
    M2k:=@AM2[0];
    for k:=0 to An-1 do begin
      M1ik:=M1i^;
      M3ij:=M3i;
      for j:=0 to An-1 do begin
        M3ij^:=M3ij^+M1ik*M2k^;
        Inc(M2k);
        Inc(M3ij)
      end;
      Inc(M1i)
    end
  end
end;

usage

Code: Pascal
..
var
   M1MEM, M2MEM, M3MEM : TP0Arr;

...
  m:=1000;
  GetMem(M1MEM, m*m*8);
  GetMem(M2MEM, m*m*8);
  GetMem(M3MEM, m*m*8);

  // GetMem does not zero the buffer and IKJ_MEM2BIS accumulates into AM3,
  // so the result matrix must be cleared before the call.
  FillChar(M3MEM^, m*m*8, 0);

  IKJ_MEM2BIS(m, M1MEM, M2MEM, M3MEM);

  FreeMem(M1MEM);
  FreeMem(M2MEM);
  FreeMem(M3MEM);

photor

  • Jr. Member
  • **
  • Posts: 80
Re: efficiency problem
« Reply #128 on: March 13, 2025, 03:28:45 am »
Quote from: Paolo on March 11, 2025, 10:18:08 pm
My last attempt with a single thread; can you check the speed?

I made the array size variable (mode = Delphi, no range checks).

I tested it; it doesn't help much.
Recently I have gained a better understanding of high-performance matrix multiplication. I will post something soon, when I have some spare time.

Paolo

  • Hero Member
  • *****
  • Posts: 584
Re: efficiency problem
« Reply #129 on: March 13, 2025, 08:38:46 am »
Yes, about 10% better.

In any case, can we say that between the two codes below the difference in speed is large, making the solution based on dynamic arrays not really usable?

The same code with static arrays...

Code: Pascal
..
  TVector = array [0..gN-1] of double;
  TMatrix = array [0..gN-1] of TVector;
..
procedure TForm1.IJK_STD(var AMat1, AMat2, AMat3: TMatrix);
var
  i, j, k : integer;
  Sum : Double;
begin
  for i:=0 to gN-1 do
    for j:=0 to gN-1 do begin
      Sum:=0;
      for k:=0 to gN-1 do
        Sum:=Sum+AMat1[i,k]*AMat2[k,j];
      AMat3[i,j]:=Sum
    end
end;

...and the same with dynamic arrays:

Code: Pascal
..
  TArrayDouble = array of array of double;
..
procedure TForm1.IJK_DynArr(var AMat1, AMat2, AMat3: TArrayDouble);
var
  i, j, k : integer;
  Sum : Double;
begin
  for i:=0 to gN-1 do
    for j:=0 to gN-1 do begin
      Sum:=0;
      for k:=0 to gN-1 do
        Sum:=Sum+AMat1[i,k]*AMat2[k,j];
      AMat3[i,j]:=Sum
    end
end;

Quote
I will post something soon, when I have some spare time

good to know.

Thaddy

  • Hero Member
  • *****
  • Posts: 16940
Re: efficiency problem
« Reply #130 on: March 13, 2025, 12:46:29 pm »
Once memory for the arrays is allocated there is no speed difference between static and dynamic arrays as long as the code to access the array elements is exactly the same. Dynamic arrays only have a slight speed penalty at allocation time. Then again, a static array may affect the load time of the executable.

Khrys

  • Full Member
  • ***
  • Posts: 227
Re: efficiency problem
« Reply #131 on: March 13, 2025, 01:30:16 pm »
Quote from: Thaddy on March 13, 2025, 12:46:29 pm
Once memory for the arrays is allocated there is no speed difference between static and dynamic arrays as long as the code to access the array elements is exactly the same. Dynamic arrays only have a slight speed penalty at allocation time. Then again, a static array may affect the load time of the executable.

Did you even read the code?

This has nothing to do with static/dynamic memory allocation - it's about fixed-size, contiguous, true 2D arrays vs. dynamically-sized arrays of arrays, the latter of which requires an extra pointer dereference and may have terrible memory locality.
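To make the distinction concrete, here is a minimal sketch (type and procedure names are mine) of the two layouts being contrasted: the flat variant keeps one contiguous block and computes the row offset itself, while the array-of-arrays variant keeps each row as a separately allocated dynamic array behind an extra pointer.

Code: Pascal
type
  // One contiguous heap block; element (i,j) is read as M[i*N + j].
  TFlatMatrix = array of Double;
  // Outer array of row pointers; M[i,j] needs an extra dereference and
  // the rows are not guaranteed to be adjacent in memory.
  TJaggedMatrix = array of array of Double;

procedure AllocFlat(var M: TFlatMatrix; N: Integer);
begin
  SetLength(M, N * N);   // single allocation, rows adjacent
end;

procedure AllocJagged(var M: TJaggedMatrix; N: Integer);
begin
  SetLength(M, N, N);    // one outer plus N row allocations
end;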

Paolo

  • Hero Member
  • *****
  • Posts: 584
Re: efficiency problem
« Reply #132 on: March 13, 2025, 01:30:27 pm »
Hi Thaddy, copy-paste and run the two codes above and let me know what you see in terms of speed.
Here: Win64, FPC 3.2.2, -O4 optimization.

LV

  • Sr. Member
  • ****
  • Posts: 266
Re: efficiency problem
« Reply #133 on: March 14, 2025, 05:51:11 pm »
As part of the exercise, I:
1. downloaded the libopenblas.dll file from https://sourceforge.net/projects/openblas/.
2. wrote a program.

Code: Pascal
program MatrixTest;

uses
  SysUtils,
  CTypes,
  Math;

const
  N = 1000;
  EPSILON = 1e-6;

var
  A, B, C_Blas, C_Naive: array of double;
  i, j, k: integer;
  start_time, end_time: QWord;
  max_diff: double;
  alpha, beta: double;
  mm, nn, k_val: integer;

procedure dgemm(transa: PChar; transb: PChar; m: PCInt; n: PCInt;
  k: PCInt; alpha: PDouble; A: PDouble; lda: PCInt; B: PDouble;
  ldb: PCInt; beta: PDouble; C: PDouble; ldc: PCInt); cdecl;
  external 'libopenblas' Name 'dgemm_';

procedure NaiveMatrixMultiplyColMajor;
var
  sum: double;
begin
  for i := 0 to N - 1 do
    for j := 0 to N - 1 do
    begin
      sum := 0.0;
      for k := 0 to N - 1 do
        sum := sum + A[i + k * N] * B[k + j * N];
      C_Naive[i + j * N] := sum;
    end;
end;

begin
  Randomize;

  SetLength(A, N * N);
  SetLength(B, N * N);
  SetLength(C_Blas, N * N);
  SetLength(C_Naive, N * N);

  for i := 0 to N - 1 do
    for j := 0 to N - 1 do
    begin
      A[i + j * N] := Random;
      B[i + j * N] := Random;
    end;

  mm := N;
  nn := N;
  k_val := N;
  alpha := 1.0;
  beta := 0.0;

  start_time := GetTickCount64;
  dgemm('N', 'N', @mm, @nn, @k_val, @alpha, @A[0], @mm, @B[0], @k_val,
        @beta, @C_Blas[0], @mm);
  end_time := GetTickCount64;
  Writeln('OpenBLAS time: ', (end_time - start_time), ' ms');

  start_time := GetTickCount64;
  NaiveMatrixMultiplyColMajor;
  end_time := GetTickCount64;
  Writeln('Naive time:    ', (end_time - start_time), ' ms');

  max_diff := 0.0;
  for i := 0 to N * N - 1 do
    max_diff := Max(max_diff, Abs(C_Naive[i] - C_Blas[i]));

  Writeln('Max difference: ', max_diff: 0: 10);
  if max_diff < EPSILON then
    Writeln('Results are consistent!')
  else
    Writeln('Results differ!');

  readln;
end.

Output (i7 8700; Windows 11; FPC 3.2.2):

Code: Text
OpenBLAS time: 16 ms
Naive time:    2140 ms
Max difference: 0.0000000000
Results are consistent!

It appears that I am approaching the performance limit (16 ms), at least on my machine.  ;)
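Since much of the comparison in this thread hinges on how many threads each library uses, it may be worth adding that OpenBLAS's thread count can be queried and pinned from the same program. A hedged sketch, assuming the standard openblas_get_num_threads / openblas_set_num_threads exports are present in this particular DLL build:

Code: Pascal
// Sketch: controlling OpenBLAS's worker threads from Free Pascal.
// These exports exist in standard OpenBLAS builds; verify them against your libopenblas.dll.
function openblas_get_num_threads: LongInt; cdecl;
  external 'libopenblas' name 'openblas_get_num_threads';
procedure openblas_set_num_threads(num_threads: LongInt); cdecl;
  external 'libopenblas' name 'openblas_set_num_threads';

// e.g. before timing dgemm:
//   Writeln('OpenBLAS threads: ', openblas_get_num_threads);
//   openblas_set_num_threads(1);  // force a single-threaded run for a like-for-like comparison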
« Last Edit: March 14, 2025, 05:53:24 pm by LV »

ALLIGATOR

  • Full Member
  • ***
  • Posts: 155
Re: efficiency problem
« Reply #134 on: March 14, 2025, 06:13:44 pm »
Quote from: LV on March 14, 2025, 05:51:11 pm
Code: Text
OpenBLAS time: 16 ms
Naive time:    2140 ms
Max difference: 0.0000000000
Results are consistent!
It appears that I am approaching the performance limit (16 ms), at least on my machine.  ;)

🤖🆒🎉

 
