Unfortunately, I cannot improve the internal CPU routines.

Your smooth procedure does a lot of multiplications (~350 million for this test), so I thought it would be better to avoid the multiplications and do some precalculations instead. Here are my results on a (slow) 2400 MHz P4 ;-)

Delphi7: newsmooth/oldaligned = 4125/4391 (~6% speed increase)

Lazarus: newsmooth/oldaligned = 4094/4703 (~13% speed increase)

Have fun,

Marius

// SmoothInput
//
// Applies a separable Gaussian blur to the image read through gBuff^ and
// writes the result through gSmoothBuff^. The image is gSrcWid pixels wide,
// gSrcHt pixels high and stored row by row (index = y * gSrcWid + x).
// NOTE(review): both buffers are assumed to hold at least gSrcWid * gSrcHt
// byte-sized elements - confirm against their declarations.
//
// lFWHM is the full width at half maximum of the Gaussian, in pixels.
// It is converted to sigma with sigma = FWHM / sqrt(8 * ln(2)) and the kernel
// is cut off at round(6 * sigma) pixels on either side of the centre.
// A value <= 0 disables smoothing: the kernel degenerates to the single
// weight 1 and the image is copied through unchanged.
//
// The filter runs in two passes (rows into a temporary buffer, then columns
// into gSmoothBuff^). Near the borders the kernel is simply truncated; the
// remaining weights are not renormalised.
procedure SmoothInput(lFWHM: integer);
{$ALIGN 8}
var
  i, lcutoffvoxx, lY, lX, lMin, lMax, lPos, lYPos: integer;
  lDataBuffer: array of array of single;
  lsigma, lexpd, lcumgauss: single;
  lTempBuff: array of byte;
  lxra: array of single;
  pbyte1: pbyte;
begin
  // Calculate static data
  if lFWHM > 0 then begin
    lsigma := (lFWHM) / sqrt(8 * ln(2));
    lcutoffvoxx := round(6 * lsigma);
    lexpd := 2 * lsigma * lsigma;
  end else begin
    // Degenerate kernel: only index 0 is evaluated below and exp(0) = 1,
    // so the kernel is [1]. This avoids 0/0 in the exponent and a negative
    // array length for non-positive FWHM values.
    lcutoffvoxx := 0;
    lexpd := 1;
  end;
  SetLength(lTempBuff, gSrcWid * gSrcHt);

  // Calculate lxra table: one half of the symmetric Gaussian kernel,
  // lxra[i] being the weight at a distance of i pixels from the centre.
  SetLength(lxra, lcutoffvoxx + 1);
  lcumgauss := 0;
  for i := 0 to lcutoffvoxx do begin
    lxra[i] := exp(-1 * (i * i) / lexpd);
    lcumgauss := lcumgauss + lxra[i];
  end;
  // Sum of the full two-sided kernel: both halves, centre counted once.
  lcumgauss := 2 * lcumgauss - lxra[0];
  if lcumgauss <> 0 then begin
    // Normalise so the full kernel sums to 1.
    for i := 0 to lcutoffvoxx do begin
      lxra[i] := lxra[i] / lcumgauss;
    end;
  end;

  // Precalculate to avoid multiplications in the inner loop (reduce it to a
  // sum): lDataBuffer[v, d] = v * lxra[d] for every byte value v and every
  // kernel distance d.
  // Dynamic arrays are really surprisingly efficient (and clearer to read).
  SetLength(lDataBuffer, 256, lcutoffvoxx + 1);
  for lX := 0 to 255 do begin
    for lY := 0 to lcutoffvoxx do begin
      lDataBuffer[lX, lY] := lX * lxra[lY];
    end;
  end;

  // Smooth horizontally: gBuff^ -> lTempBuff
  lYPos := 0; // index of the first pixel of the current row
  for lY := 0 to gSrcHt - 1 do begin
    for lX := 0 to gSrcWid - 1 do begin
      // Clamp the kernel window to the current row.
      lMin := lX - lcutoffvoxx;
      if lMin < 0 then
        lMin := 0;
      lMax := lX + lcutoffvoxx;
      if lMax >= gSrcWid then
        lMax := gSrcWid - 1;
      lcumgauss := 0;
      pbyte1 := @gBuff^[lYPos + lMin];
      for lPos := lMin to lMax do begin
        lcumgauss := lcumgauss + lDataBuffer[pbyte1^, abs(lX - lPos)];
        inc(pbyte1); // next pixel in the same row
      end;
      lTempBuff[lX + lYPos] := round(lcumgauss);
    end;
    inc(lYPos, gSrcWid);
  end;

  // Smooth vertically: lTempBuff -> gSmoothBuff^
  for lX := 0 to gSrcWid - 1 do begin
    lYPos := 0;
    for lY := 0 to gSrcHt - 1 do begin
      // Clamp the kernel window to the current column.
      lMin := lY - lcutoffvoxx;
      if lMin < 0 then
        lMin := 0;
      lMax := lY + lcutoffvoxx;
      if lMax >= gSrcHt then
        lMax := gSrcHt - 1;
      lcumgauss := 0;
      pbyte1 := @lTempBuff[(lMin * gSrcWid) + lX];
      for lPos := lMin to lMax do begin
        lcumgauss := lcumgauss + lDataBuffer[pbyte1^, abs(lY - lPos)];
        inc(pbyte1, gSrcWid); // same column, next row
      end;
      gSmoothBuff^[lYPos + lX] := round(lcumgauss);
      inc(lYPos, gSrcWid);
    end;
  end;
end;