Here is my experiment using SSE2 code:
program MatMulSSE2;
uses
sysutils;
type
valuetype = double; { should not be changed }
{$R-}{$Q-}
const
n = 1000;
var
M1,M2,TM2,M3: array[1..n,1..n] of valuetype;
row,col,k: integer;
T: TDateTime;
sum: valuetype;
sums:array[1..2] of valuetype;
gsum: valuetype;
pRow, pCol: pointer;
label
again;
begin
Randomize;
for row := 1 to n do
for col := 1 to n do begin
{ example 1 } //{
M1[row,col] := row; { or col }
M2[row,col] := 1;//}
{ example 2 } {
M1[row,col] := Random*50;
M2[row,col] := Random*50;//}
{ example 3 } {
if row = col then
begin
M1[row,col] := 1;
M2[row,col] := 1;
end
else
begin
M1[row,col] := 0;
M2[row,col] := 0;
end;//}
{ Transposition of the second matrix }
TM2[col, row] := M2[row,col];
end;
T := now;
gsum := 0;
for row := 1 to n do //row from 1st matrix
for col := 1 to n do //col from 2nd matrix
begin
{ original code in Pascal --> to use remove } {
sum := 0;
for k:=1 to n do //col from 1st / row from 2nd
{ using matrix M2 } {
sum:=sum + M1[row,k]*M2[k,col]; //}
{ using transposed matrix TM2 }
sum:=sum + M1[row,k]*TM2[col,k];
M3[row, col] := sum; //}
{ SSE2 code } //{
{ SSE2 handles two double values at the same time }
sums[1] := 0; sums[2] := 0;
pRow := @M1[row, 1];
pCol := @TM2[col, 1];
{$ASMMODE intel}
asm
{ loop counter }
mov ecx, n/2
{ sum := 0 }
movupd xmm2, sums
{ @M1[Row, 1] }
mov eax, pRow
{ @M2[1, Col] }
mov ebx, pCol
again:
{ xmm0 := M1[Row, k .. k+1] }
movapd xmm0, [eax]
//movupd xmm0, [eax]
{ xmm1 := TM2[Col, k .. k+1] }
movapd xmm1, [ebx]
{ xmm0 := M1[Row, k] * M2[k, Col] , M1[Row, k+1] * M2[k+1, Col] }
mulpd xmm0, xmm1
{ xmm2 := xmm2 + xmm0 }
addpd xmm2, xmm0
{ point at next pair of columns from M1[Row] }
add eax, 16 { 2 * size of valuetype }
{ point at next pair of rows from TM2[Col] }
add ebx, 16
dec ecx
jnz again
{ following steps are equivelent to sums[1] := sums[1] + sums[2] }
{ xmm3 := xmm2 }
movupd xmm3, xmm2
{ copy double value from higher position to lower position in xmm3 }
unpckhpd xmm3, xmm3
{ add the two numbers: sums[1] := sums[1] + sums[2] }
addpd xmm2, xmm3
movupd sums, xmm2
end ['eax', 'ebx', 'ecx', 'xmm0', 'xmm1', 'xmm2', 'xmm3'];
M3[row, col] := sums[1];// + sum[2];//}
gsum := gsum + M3[row, col];
end;
WriteLn('Grand sum of all the numbers in the result matrix: ', gsum);
WriteLn('Time: ', round((now-T)*24*60*60*1000),' ms');
WriteLn('');
WriteLn('First 10 rows and 3 columns of the result matrix:');
for row := 1 to 10 do
begin
for col := 1 to 3 do
Write(M3[row, col], ' ');
WriteLn('');
end;
end.
I adjusted one of the codes posted above. It doubled the speed on my old 32bit system. SSE2 can handle two double precision multiplications at the same time.