I felt it was more appropriate to post here than to start a new thread without any context.
{$asmmode intel}
{$mode objfpc}
// Benchmark program: times several ways of reading a byte lookup table that has one entry per Word value.
Program cpu_CountDigits;
Uses Math,SysUtils,Windows;
Var
// Lookup table indexed by a Word value; the main block fills entry q with length(IntToStr(q)).
_DigitCount : Array[Word] of Byte;
// Untyped pointer set to @_DigitCount[0]; passed to CountDigits and loaded by the inline-assembly loops.
DigitCount : Pointer;
// q, w: loop counters; e: receives each looked-up value in the Pascal timing loops.
q,w,e : DWord;
// f: value returned by QueryPerformanceFrequency; c, cc: QueryPerformanceCounter readings taken before and after each timed section.
f,c,cc : int64;
// Destination buffer that the inline-assembly loops copy table bytes into.
Results : Array[Word] of Byte;
// Returns the byte stored at index n of the table that `where` points to,
// zero-extended to a DWord (the result is left in EAX).
//   where : base address of a byte table; the caller must guarantee that
//           index n lies inside it.
//   n     : zero-based index into the table.
// Modifies RDX, which is a volatile register in both the Win64 and the
// System V AMD64 calling conventions.
// NOTE(review): assumes `where` is not passed in RDX (true for Win64, where it
// arrives in RCX, and for System V, where it arrives in RDI) - confirm for any other ABI.
Function CountDigits(where : Pointer; n : Word) : DWord; inline; Assembler;
Asm
// n is only 16 bits wide: widen it into a full register first, so the
// effective address uses two 64-bit registers and no stale upper bits.
movzx edx,n
movzx eax,byte ptr [where+rdx]
End;
Const
// Number of table entries: one for every possible Word value.
TableSize = 65536;
// Full passes over the table per measurement; PassCount*TableSize is roughly one billion lookups.
// Every test below uses the same pass count so the printed timings are directly comparable.
PassCount = 15258;
// Main program: builds the digit-count table, then times four ways of reading it.
// Each test prints its elapsed wall-clock time in seconds.
Begin
  QueryPerformanceFrequency(f);
  // Build the table: entry q holds the number of decimal digits of q.
  For q := 0 to TableSize-1 do _DigitCount[q] := length(IntToStr(q));
  DigitCount := @_DigitCount[0];
  // -----------------------------------------------
  // Test 1: one call to the CountDigits assembler function per lookup.
  QueryPerformanceCounter(c);
  for q := 0 to PassCount-1 do
    for w := 0 to TableSize-1 do // one billion
      e := CountDigits(DigitCount,w);
  QueryPerformanceCounter(cc);
  writeln(FormatFloat('0.0000',(cc-c)/f)); // in Seconds.
  // -----------------------------------------------
  // Test 2: plain Pascal array indexing.
  QueryPerformanceCounter(c);
  for q := 0 to PassCount-1 do
    for w := 0 to TableSize-1 do // one billion
      e := _DigitCount[w];
  QueryPerformanceCounter(cc);
  writeln(FormatFloat('0.0000',(cc-c)/f)); // in Seconds.
  // -----------------------------------------------
  // Test 3: hand-written loop that copies the table into Results one byte at a time.
  // Register roles: r8 = one past the end of the table, r9 = one past the end of
  // Results, rdx = negative index running from -TableSize up to 0, ecx = passes left,
  // eax = byte being copied. Only volatile registers are used, and they are listed
  // after End so the compiler knows they are overwritten.
  FillChar(Results,sizeof(Results),0);
  QueryPerformanceCounter(c);
  Asm
    mov   r8,DigitCount
    lea   r9,Results[0]
    add   r8,TableSize
    add   r9,TableSize
    mov   ecx,PassCount
  @nextPass:
    mov   rdx,TableSize
    neg   rdx                       // full 64-bit negate: rdx = -TableSize
  @copyByte:
    movzx eax,byte ptr [r8+rdx]
    mov   byte ptr [r9+rdx],al      // byte-sized store of the loaded value
    add   rdx,1                     // reaches zero right after the last entry
    jnz   @copyByte
    sub   ecx,1
    jnz   @nextPass
  End ['RAX','RCX','RDX','R8','R9'];
  QueryPerformanceCounter(cc);
  writeln(FormatFloat('0.0000',(cc-c)/f)); // in Seconds.
  // for q := 0 to 65535 do write(Results[q],' ');
  // -----------------------------------------------
  // Test 4: same copy, inner loop unrolled to two bytes per iteration.
  // Same register roles as test 3. TableSize is even, so stepping the index by 2
  // lands exactly on zero. The pass count is NOT halved: unrolling changes the
  // work per loop iteration, not the number of entries copied per pass.
  FillChar(Results,sizeof(Results),0);
  QueryPerformanceCounter(c);
  Asm
    mov   r8,DigitCount
    lea   r9,Results[0]
    add   r8,TableSize
    add   r9,TableSize
    mov   ecx,PassCount
  @nextPass:
    mov   rdx,TableSize
    neg   rdx                       // full 64-bit negate: rdx = -TableSize
  @copyPair:
    movzx eax,byte ptr [r8+rdx]
    mov   byte ptr [r9+rdx],al
    movzx eax,byte ptr [r8+rdx+1]
    mov   byte ptr [r9+rdx+1],al
    add   rdx,2
    jnz   @copyPair
    sub   ecx,1
    jnz   @nextPass
  End ['RAX','RCX','RDX','R8','R9'];
  QueryPerformanceCounter(cc);
  writeln(FormatFloat('0.0000',(cc-c)/f)); // in Seconds.
  // for q := 0 to 65535 do write(Results[q],' ');
End.
Results on my G15 Zephyrus with R5900HS and 16gig of 3200 DDR in Dual Channel mode,
for roughly a billion lookups,
in seconds:
0.5174
1.5414
0.2522
0.1258
Roughly. Every run gives around the same result. Obviously all the data is in the cache,
but because of the easy parallelism you'd probably want to run through all your numbers at once and save the results for further usage.
I needed this and was wondering whether there were any clever solutions. After reading the first page, I decided to come up with my own solution instead,
which is what I've implemented above. I don't need a large range, so a 64K limit is fine. It's easily extended to a 16M range, too.
This can be further optimized and is going to be faster on Intel processors due to their lower access latency.
Please note that the function isn't being inlined. The code's probably ugly, but works.
I hope it helps someone! o/