Unfortunately, I cannot improve the internal CPU routines.
Your smooth procedure is doing a lot of multiplications (~350kk for this test), so I thought it would be better to avoid the multiplications by doing some precalculation. Here are my results on a (slow) 2400 MHz P4 ;-)
Delphi7: newsmooth/oldaligned = 4125/4391 (~6% speed increase)
Lazarus: newsmooth/oldaligned = 4094/4703 (~13% speed increase)
Have fun,
Marius
// SmoothInput: separable Gaussian blur of the image in gBuff^, written to
// gSmoothBuff^.
//
// lFWHM is the kernel's full width at half maximum, in pixels. It is
// converted to sigma with sigma = FWHM / sqrt(8 * ln 2), and the kernel is
// truncated at round(6 * sigma) pixels on either side of the centre.
// A value <= 0 selects a single-tap identity kernel (output = input).
//
// The source is read through a byte pointer and used to index a 256-row
// lookup table, i.e. it is treated as 8-bit samples laid out row by row with
// gSrcWid samples per row and gSrcHt rows.
//
// To keep multiplications out of the inner loops, a table
// lDataBuffer[value, distance] = value * weight[distance] is built once, so
// each tap of the convolution is just a table lookup and an add.
//
// NOTE: the weights are normalised over the full (unclipped) kernel. Near the
// image borders the window is clipped without renormalising, so border pixels
// are computed from a partial sum of weights.
procedure SmoothInput(lFWHM: integer);
{$ALIGN 8}
var
  i, lcutoffvoxx, lY, lX, lMin, lMax, lPos, lYPos: integer;
  lDataBuffer: array of array of single;
  lsigma, lexpd, lcumgauss: single;
  lTempBuff: array of byte;
  lxra: array of single;
  pbyte1: pbyte;
begin
  // Calculate static data
  if lFWHM > 0 then begin
    lsigma := (lFWHM) / sqrt(8 * ln(2));
    lcutoffvoxx := round(6 * lsigma);
    lexpd := 2 * lsigma * lsigma;
  end else begin
    // Degenerate width: a zero sigma would evaluate 0/0 below and a negative
    // one would yield a negative table length. Use a one-tap kernel instead
    // (lexpd only has to be non-zero here, since i is always 0).
    lcutoffvoxx := 0;
    lexpd := 1;
  end;
  SetLength(lTempBuff, gSrcWid * gSrcHt);

  // Calculate the one-sided kernel lxra[0..cutoff]
  SetLength(lxra, lcutoffvoxx + 1);
  lCumGauss := 0;
  for i := 0 to lcutoffvoxx do begin
    lxra[i] := exp( - 1 * (i * i) / lexpd);
    lCumGauss := lCumGauss + lxra[i];
  end;
  // The kernel is symmetric: count both sides, but the centre tap only once
  lCumGauss := 2 * lCumGauss - lxra[0];
  if lCumGauss <> 0 then begin
    for i := 0 to lcutoffvoxx do begin
      lxra[i] := lxra[i] / lCumGauss;
    end;
  end;

  // Precalculate value * weight to avoid multiplications in the inner loops
  // (reducing them to a sum). Dynamic arrays are surprisingly efficient
  // (and clearer to read).
  SetLength(lDataBuffer, 256, lcutoffvoxx + 1);
  for lX := 0 to 255 do begin
    for lY := 0 to lcutoffvoxx do begin
      lDataBuffer[lX, lY] := lX * lxra[lY];
    end;
  end;

  // Smooth horizontally: gBuff -> lTempBuff
  lYPos := 0; // offset of the first sample of the current row
  for lY := 0 to gSrcHt - 1 do begin
    for lX := 0 to gSrcWid - 1 do begin
      // Clip the kernel window to the current row
      lMin := lX - lcutoffvoxx;
      if lMin < 0 then
        lMin := 0;
      lMax := lX + lcutoffvoxx;
      if lMax >= gSrcWid then
        lMax := gSrcWid - 1;
      lCumGauss := 0;
      pbyte1 := @gBuff^[lYPos + lMin];
      for lPos := lMin to lMax do begin
        lCumGauss := lCumGauss + lDataBuffer[pbyte1^, abs(lX - lPos)];
        inc(pbyte1);
      end;
      lTempBuff[lX + lYPos] := round(lCumGauss);
    end;
    inc(lYPos, gSrcWid);
  end;

  // Smooth vertically: lTempBuff -> gSmoothBuff
  for lX := 0 to gSrcWid - 1 do begin
    lYPos := 0;
    for lY := 0 to gSrcHt - 1 do begin
      // Clip the kernel window to the current column
      lMin := lY - lcutoffvoxx;
      if lMin < 0 then
        lMin := 0;
      lMax := lY + lcutoffvoxx;
      if lMax >= gSrcHt then
        lMax := gSrcHt - 1;
      lCumGauss := 0;
      pbyte1 := @lTempBuff[(lMin * gSrcWid) + lX];
      for lPos := lMin to lMax do begin
        lCumGauss := lCumGauss + lDataBuffer[pbyte1^, abs(lY - lPos)];
        inc(pbyte1, gSrcWid); // step one row down within the same column
      end;
      gSmoothBuff^[lYPos + lX] := round(lCumGauss);
      inc(lYPos, gSrcWid);
    end;
  end;
end;