Recent

Author Topic: Performances when searching in text files  (Read 6838 times)

sstvmaster

  • Sr. Member
  • ****
  • Posts: 306
Re: Performances when searching in text files
« Reply #15 on: February 28, 2022, 03:20:10 pm »
code from wp:
Code: Text  [Select][+][-]
  1. Time to read file: 00:00:03.665
  2. Time to count '46': 00:00:00.849; 20000000 lines found.
  3.  

I3-9100 (4C/4T) @ 3,6GHz with SSD
greetings Maik

Windows 10,
- Lazarus 4.4 (stable) + fpc 3.2.2 (stable)
- Lazarus 4.99 (trunk) + fpc 3.3.1 (main/trunk)

wp

  • Hero Member
  • *****
  • Posts: 13352
Re: Performances when searching in text files
« Reply #16 on: February 28, 2022, 05:58:23 pm »
Of course, the time reported by avk is the total time, reading plus searching; in my code they are separate.

Running avk's test I get:
Code: [Select]
Text file time: 00:00:04.475; 20000000 lines found.
TMemoryStream time: 00:00:00.976; 20000000 lines found.
TStringList time: 00:00:05.786; 20000000 lines found.

The stringlist time is longer than the sum of the reading and searching times reported above. As I notice now, my code was cheating a bit, because I took the time measurement before Stringlist.Free, which of course must be included.

arenzi

  • Newbie
  • Posts: 5
Re: Performances when searching in text files
« Reply #17 on: February 28, 2022, 06:39:04 pm »
TMemoryStream: 1.5 seconds on my machine !!!

avk

  • Hero Member
  • *****
  • Posts: 825
Re: Performances when searching in text files
« Reply #18 on: February 28, 2022, 06:54:03 pm »
Yes, just keep in mind that it is not safe, it would probably be better to declare it something like this:
Code: Pascal  [Select][+][-]
  1. procedure Search2;
  2. var
  3.   cnt: Integer = 0;
  4.   // Counts the lines in the buffer [p, p + aBufSize) that contain '46'.
  5.   procedure ProcessMem(p: PByte; aBufSize: Integer);
  6.   var
  7.     pLast: PByte;
  8.     I: Integer;
  9.     s: shortstring;
  10.   begin
  11.     pLast := p + aBufSize;
  12.     s := '';
  13.     I := 0;
  14.     while p < pLast do
  15.       begin
  16.         if not (p^ in [0, 10, 13]) then
  17.           begin
  18.             if I = 255 then
  19.               raise Exception.Create('Too long input string');
  20.             s[I+1] := char(p^);
  21.             Inc(I);
  22.           end;
  23.         // Flush on a terminator, and also at the very last byte so that a
  24.         // final line without a trailing line break is still examined.
  25.         if (I <> 0) and ((p^ in [0, 10, 13]) or (p + 1 = pLast)) then
  26.           begin
  27.             s[0] := char(I);
  28.             Inc(cnt, Ord(Pos('46', s) > 0));
  29.             I := 0;
  30.           end;
  31.         Inc(p);
  32.       end;
  33.   end;
  34. var
  35.   Start: TTime;
  36. begin
  37.   Start := Time;
  38.   with TMemoryStream.Create do
  39.     try
  40.       LoadFromFile('test.txt');
  41.       ProcessMem(Memory, Size);
  42.     finally
  43.       Free;
  44.     end;
  45.   WriteLn('TMemoryStream time: ', FormatDateTime('hh:nn:ss.zzz', Time - Start), '; ', cnt, ' lines found.');
  46. end;
  47.  

BobDog

  • Sr. Member
  • ****
  • Posts: 394
Re: Performances when searching in text files
« Reply #19 on: March 01, 2022, 12:52:59 am »

Self contained for windows.
1 second for 85 mb file.
You can use your own test.txt; in that case comment out line 159 (the createfile call).
Code: Pascal  [Select][+][-]
  1.  
  2. {$GOTO ON}
  3. {$mode delphi}
  4. uses
  5. sysutils;
  6.  
  7. function fopen (p1:pchar; p2:pchar):pointer ; cdecl external 'msvcrt.dll' name 'fopen';
  8.     function fread (p1:pointer;i1:integer;i2:integer;_FILE: pointer):integer ; cdecl external 'msvcrt.dll' name 'fread';
  9.     function fwrite (p1:pointer;i1:integer;i2:integer;_FILE: pointer):integer ; cdecl external 'msvcrt.dll' name 'fwrite';
  10.     function fseek (_FILE:pointer;i1:integer;i2:integer):integer ; cdecl external 'msvcrt.dll' name 'fseek';
  11.     function ftell (_FILE:pointer):integer; cdecl external 'msvcrt.dll' name 'ftell';
  12.     function fclose (_FILE:pointer):integer   cdecl external 'msvcrt.dll' name 'fclose';
  13. Type  
  14.   intArray = Array of longword;
  15.  
  16.  function instr(somestring:ansistring;partstring:ansistring):boolean;
  17. // Returns true when partstring occurs anywhere inside somestring; false when
  18. // either string is empty or partstring is the longer one.
  19. var
  20. i,j,ln,lnp:longword;
  21. found:boolean;
  22. begin
  23. ln:=length(somestring);
  24. lnp:=length(partstring);
  25. // Guard first: the lengths are unsigned, so ln-lnp and lnp-1 would wrap around.
  26. if (lnp=0) or (lnp>ln) then exit(false);
  27. // Only start positions where partstring still fits are tried, so
  28. // somestring[i+j] never indexes past the end of the string.
  29. for i:=1 to ln-lnp+1 do
  30.      begin
  31.      found:=true;
  32.      for j:=0 to lnp-1 do
  33.      if somestring[i+j]<>partstring[j+1] then
  34.      begin
  35.      found:=false;
  36.      break;
  37.      end;
  38.      if found then exit(true);
  39.      end;
  40.   exit(false);
  41. end;  
  42.  
  43.   function tally(somestring:ansistring;partstring:ansistring;var arr:intarray ):longword;
  44. // Counts the non-overlapping occurrences of partstring in somestring.
  45. // On return arr[0] holds that count and arr[1..count] hold the 1-based
  46. // start positions, so arr always has count+1 elements.
  47. var
  48. i,j,ln,lnp,count:longword;
  49. pass:integer;
  50. hit:boolean;
  51. begin
  52. ln:=length(somestring);
  53. lnp:=length(partstring);
  54. count:=0;
  55. // Two passes: the first only counts, the second stores the positions.
  56. if (lnp>0) and (lnp<=ln) then
  57. for pass:=1 to 2 do
  58.    begin
  59.    if pass=2 then SetLength(arr,count+1);
  60.    count:=0;
  61.    i:=1;
  62.    while i<=ln-lnp+1 do
  63.      begin
  64.      hit:=true;
  65.      for j:=0 to lnp-1 do
  66.        if somestring[i+j]<>partstring[j+1] then begin hit:=false; break; end;
  67.      if hit then
  68.        begin
  69.        count:=count+1;
  70.        if pass=2 then arr[count]:=i;
  71.        i:=i+lnp-1; // skip the rest of the match: occurrences do not overlap
  72.        end;
  73.      i:=i+1;
  74.      end;
  75.    end;
  76. SetLength(arr,count+1); arr[0]:=count; // also covers the no-match case
  77.   result:=count;
  78. end; {tally}
  79.  
  80. function filelen(filename:pchar):integer;
  81.      var
  82.      fp:pointer;
  83.      r:pchar='rb'; // read-only binary: measuring the size needs no write access
  84.      size:integer;
  85.      SEEK_END:integer=2;
  86.      begin
  87.      // Returns the size of the file in bytes, or 0 when it cannot be opened.
  88.      fp:=fopen(filename,r);
  89.       if fp = nil then
  90.        begin
  91.        writeln( 'Unable to open  ',filename);
  92.        exit(0)
  93.        end;
  94.        fseek(fp, 0, SEEK_END);
  95.        size:=ftell(fp);
  96.     fclose(fp);
  97.     exit(size);
  98.     end;
  99.  
  100.  
  101.  
  102. procedure savefilestring(content:ansistring;filename:pchar);
  103.        var
  104.        w:pchar='wb';
  105.        fp:pointer;
  106.        begin // writes content to filename, replacing any existing file
  107.        fp:=fopen(filename,w);
  108.        if fp = nil then
  109.        begin
  110.        writeln( 'Unable to save  ',filename);
  111.        exit
  112.        end;
  113.         if content<>'' then fwrite(@content[1], 1, length(content), fp); // content[1] does not exist for ''
  114.         fclose(fp);
  115.        end;
  116.        
  117.        procedure loadfilestring(var content:ansistring;filename:pchar);
  118.        // Reads the whole file into content; content stays '' if it is missing or empty.
  119.        var
  120.        w:pchar='rb';
  121.        fp:pointer; l:longint;
  122.        begin
  123.        content:='';
  124.        fp:=fopen(filename,w);
  125.        if fp = nil then
  126.        begin
  127.        writeln( 'Unable to open  ',filename);
  128.        exit
  129.        end;
  130.        fseek(fp,0,2); l:=ftell(fp); fseek(fp,0,0); // size via seek to end (2), then rewind (0)
  131.        if l>0 then begin setlength(content,l); setlength(content,fread(@content[1], 1,l, fp)); end; // keep only the bytes actually read
  132.         fclose(fp);
  133.        end;
  134.        
  135.        procedure createfile(filename:ansistring);
  136.        var
  137.        n:longint;
  138.        chunk:ansistring='1234567890'+#10+'44466'+#10;
  139.        buffer:ansistring='';
  140.        begin // builds 5000000 copies of the two-line chunk in memory, then saves them
  141.        writeln('creating file . . .');
  142.        for n:=1 to 5000000 do buffer:=buffer+chunk;
  143.        writeln('string created');
  144.        savefilestring(buffer,pchar(filename));
  145.        writeln('saved');
  146.        end;
  147.        
  148.        
  149.        var
  150.        i,t,count:longint;
  151.        g:ansistring='';
  152.        start:longint;
  153.        a1:intarray;
  154.        tm:int64;
  155.        
  156.        begin
  157.        count:=0;
  158.        
  159.       createfile('test.txt'); //<< do once
  160.        
  161.        // the timer starts before the load because the report below covers load and examine
  162.     tm:=gettickcount64;
  163.        loadfilestring(g,'test.txt');
  164.        
  165.     t:=tally(g,#10,a1);
  166.     writeln('Number of lines ',t);
  167.     // a1[1..high(a1)] hold line feed positions; each one closes the line that began at start
  168.     start:=1;
  169.     for i:=1 to high(a1) do
  170.     begin
  171.     if instr(copy(g,start,a1[i]-start),'46') then count:=count+1;
  172.     start:=a1[i]+1;
  173.     end;
  174.     if instr(copy(g,start,length(g)-start+1),'46') then count:=count+1; // whatever follows the last recorded line feed
  175.     writeln('Number of lines containing 46 =  ',count);
  176.     writeln('Time taken (load and examine)  ',(gettickcount64-tm)/1000,'  seconds');
  177.     writeln('File size = ',filelen('test.txt') div 1000000,' mb');
  178.     writeln;
  179.     writeln('sample of file');
  180.     writeln(leftstr(g,198));
  181.     writeln('Press return to finish');
  182.     readln;
  183.        end.
  184.  
  185.  
Tested 64 bit and 32 bit  freepascal 3.2.2

Thaddy

  • Hero Member
  • *****
  • Posts: 18729
  • To Europe: simply sell USA bonds: dollar collapses
Re: Performances when searching in text files
« Reply #20 on: March 01, 2022, 10:33:17 am »
So the slowness came from the Readln, not from the search.
Quite possibly {$I-} may improve the speed considerably for readln/writeln. At least here it does.
But note then you will have to handle any errors without exceptions.
If Europe sells their USA bonds the USD will collapse. Europe can afford that given average state debts. The USA can't afford that. Just an advice...

BobDog

  • Sr. Member
  • ****
  • Posts: 394
Re: Performances when searching in text files
« Reply #21 on: March 02, 2022, 03:34:51 pm »

I have this down to about 2.75 seconds (load file and count the required things)
I don't need tstringlist because I don't need an array or a list.
If you don't have the test file, you can create it (line approx 93)
Code: Pascal  [Select][+][-]
  1. program testfile;
  2.  
  3. uses
  4. sysutils; // for timer
  5.  
  6. Type  
  7.   intArray = Array of longword;
  8.  
  9. function filelength(filename:ansistring):longword;
  10. // Size of the file in bytes, obtained by opening it as a file of byte.
  11. var
  12. handle : File Of byte;
  13. begin
  14. Assign (handle,filename);
  15.   Reset (handle);
  16.   filelength:=FileSize(handle);
  17.   Close (handle);
  18. end;
  19.  
  20.  function tally(somestring:ansistring;partstring:ansistring;var arr:intarray ):longint;
  21. // Stores the 1-based start of each non-overlapping occurrence in arr[0..n-1]; returns n.
  22. var
  23. i,j,ln,lnp,count:longint;
  24. hit:boolean;
  25. begin
  26. ln:=length(somestring);
  27. lnp:=length(partstring);
  28. count:=0; i:=1; SetLength(arr,0);
  29. while (lnp>0) and (i<=ln-lnp+1) do
  30.    begin
  31.    hit:=true;
  32.    for j:=0 to lnp-1 do // break, not continue: a mismatch must reject this position
  33.      if somestring[j+i]<>partstring[j+1] then begin hit:=false; break; end;
  34.    if hit then
  35.      begin
  36.      if count>=length(arr) then SetLength(arr,2*count+1024); // grow in chunks
  37.      arr[count]:=i;
  38.      count:=count+1;
  39.      i:=i+lnp-1; // occurrences must not overlap
  40.      end;
  41.    i:=i+1;
  42.    end;
  43.   setlength(arr,count);
  44.   exit(count);
  45. end;
  46.  
  47.  procedure loadfile(var content: ansistring;filename:ansistring);
  48.    Var Fin : File; x:longint;
  49.    begin // reads the whole file into content as a single record of x bytes
  50.    x:=filelength(filename);
  51.    setlength(content,x);
  52.    if x=0 then exit; // empty file: nothing to read, and content[1] does not exist
  53.    Assign (Fin,filename);
  54.    Reset (Fin,x);
  55.    BlockRead (Fin,content[1],1);
  56.    close(fin);
  57. end;
  58.  
  59. procedure savefile(s:ansistring ;filename:ansistring);
  60.         var
  61.         fout:file;
  62.         begin // writes s as one record of length(s) bytes; '' just creates an empty file
  63.         Assign(fout,filename);
  64.         if s='' then Rewrite(fout,1) else Rewrite(fout,length(s));
  65.         if s<>'' then blockwrite(fout,s[1],1); // s[1] does not exist for ''
  66.         close(fout);
  67.         end;
  68.  
  69.  procedure createfile(filename:ansistring);
  70.        // Builds 340000000 characters of sample text by repeated doubling and saves them.
  71.        var
  72.        sample:ansistring='1234567890'+#10+'44466'+#10;
  73.        begin
  74.        writeln('creating file . . .');
  75.        repeat // the seed is 17 characters long, so at least one doubling always happens
  76.        sample:=sample+sample;
  77.        until length(sample)>350000000;
  78.        sample:=leftstr(sample,340000000);
  79.        writeln('string created');
  80.        savefile(sample,filename);
  81.        writeln('saved');
  82.        end;
  83.        
  84.        var
  85.        i,t,count,start:longint;
  86.        g:ansistring='';
  87.        a1:intarray;
  88.        tm,tm2,tm3:int64;
  89.        filename:ansistring='test.txt';
  90.        
  91.        begin
  92.        
  93.       // createfile('test.txt');
  94.      
  95.      count:=0;
  96.      
  97.      tm:=gettickcount64; //start timing
  98.      loadfile(g,filename);
  99.      tm2:=gettickcount64;
  100.      writeln('Time taken load the file ',(tm2-tm)/1000,'  seconds');
  101.      
  102.      tm2:=gettickcount64;
  103.     t:=tally(g,#10,a1);
  104.     writeln('Number of lines ',t);
  105.     // examine each line: a1[i] is the position of the line feed that closes the line begun at start
  106.     start:=1;
  107.     for i:=0 to high(a1) do begin if pos('46',copy(g,start,a1[i]-start))>0 then count:=count+1; start:=a1[i]+1; end;
  108.     if pos('46',copy(g,start,length(g)-start+1))>0 then count:=count+1; // text after the last line feed, if any
  109.     tm3:=gettickcount64; //end timing
  110.     writeln('Number of lines containing 46 =  ',count);
  111.     writeln;
  112.     writeln('Time taken to count lines and count lines containing 46  ',(tm3-tm2)/1000 ,'  seconds');  
  113.     writeln('Total time taken  ',(tm3-tm2)/1000 + (tm2-tm)/1000,' seconds');
  114.     writeln('File size = ',filelength(filename) div 1000000,' mb');
  115.     writeln;
  116.     writeln('sample of file,starting few and ending few ');
  117.     writeln(leftstr(g,4*17));
  118.     writeln('. . .');
  119.     writeln('. . .');
  120.     writeln;
  121.     writeln(rightstr(g,4*17));
  122.     writeln('Press return to finish');
  123.     readln;
  124.    
  125.        end.
  126.  
  127.  

Thaddy

  • Hero Member
  • *****
  • Posts: 18729
  • To Europe: simply sell USA bonds: dollar collapses
Re: Performances when searching in text files
« Reply #22 on: March 02, 2022, 07:28:03 pm »
{$I-} !!!!!!
If Europe sells their USA bonds the USD will collapse. Europe can afford that given average state debts. The USA can't afford that. Just an advice...

BobDog

  • Sr. Member
  • ****
  • Posts: 394
Re: Performances when searching in text files
« Reply #23 on: March 02, 2022, 08:52:24 pm »

Hello Thaddy.
No problem here surely, if the file is not there then use createfile to make one.
However I take note for future block files:

assign (f,'file.txt'); 
{$I-} 
rewrite (f); 
{$I+} 
if IOResult<>0 then 
  begin 
  Writeln ('Error opening file: "file.txt"'); 
  exit 
  end;

Thank you.
I think I have the fastest method so far.


wp

  • Hero Member
  • *****
  • Posts: 13352
Re: Performances when searching in text files
« Reply #24 on: March 02, 2022, 11:31:51 pm »
I think I have the fastest method so far.
Maybe you are not aware that this method is cheating a bit, because it simply counts the occurrences of the search string. But in your first post you said that you want to count the LINES containing the search string. All the other methods worked with individual lines, and avk's Search2 separated the read buffer into individual lines. Searching for the line endings takes extra time, and will therefore be somewhat slower than a simple search in one very long string.

BobDog

  • Sr. Member
  • ****
  • Posts: 394
Re: Performances when searching in text files
« Reply #25 on: March 02, 2022, 11:55:09 pm »

Code: Pascal  [Select][+][-]
  1. No wp, not cheating
  2.  
  3.    if (pos('46',g[1..a1[0]])>0)  then count:=count+1;
  4.     for i:=0 to high(a1) do if (pos('46',g[a1[i]..a1[i+1]])>0)  then count:=count+1;
  5.  
  6. My tally function returns the number of required substrings in the string, and the array saves all their positions.
  7. When I search for 46, first I search the beginning from index[1] to the first position of #10 (a1[0]).
  8. Then I search every other line in the string given by:
  9.  
  10.  if (pos('46',g[a1[i]..a1[i+1]])>0)  where g[a1[i] .. a1[i+1]] gets every line.
  11. So I dont need the string in an array, which saves time.
  12.  
  13.  
Sorry, I had to put this in a code block; it does not print correctly otherwise.

wp

  • Hero Member
  • *****
  • Posts: 13352
Re: Performances when searching in text files
« Reply #26 on: March 03, 2022, 01:05:35 am »
Oh you're right. Sorry.

Josh

  • Hero Member
  • *****
  • Posts: 1454
Re: Performances when searching in text files
« Reply #27 on: March 03, 2022, 03:43:39 am »
Just 4 fun

Attached project, running on HP Laptop

sub 1.5 seconds for read and analyze, you might get better or worse, as this cheap lappy with ssd astonishes me sometimes,

It reads the data into a byte array, analyzes it and fills an array with the line number, the position of the start of that line in the byte array, as well as the position within the line of any match(es); all data and working variables are held in a record.
« Last Edit: March 03, 2022, 03:53:26 am by josh »
The best way to get accurate information on the forum is to post something wrong and wait for corrections.

BobDog

  • Sr. Member
  • ****
  • Posts: 394
Re: Performances when searching in text files
« Reply #28 on: March 03, 2022, 04:13:31 pm »
Hi josh.
I don't use Lazarus to code, but I have a copy to test.
Your routine is very fast.
Inspired, I coded this in Geany, where I have a dedicated function tally to
1) count the number of lines in a file.
2) search each line for a particular string (46 here)
I do it in about 1.5 seconds here, O2 optimization.
If you don't have test.txt, you can create it (uncomment the createfile call at line 96)
Code: Pascal  [Select][+][-]
  1. program textsearch;
  2. {$GOTO ON}
  3. uses
  4. sysutils; // for timer
  5.  
  6. function tally(somestring:ansistring;partstring:ansistring;var ncount:longint):longint;
  7. // Returns the number of lines in somestring (lines end at #10; a final line
  8. // without #10 counts too) and adds to ncount the number of those lines that
  9. // contain partstring. Each line is counted once, however often it matches.
  10. var
  11. i,j,k,ln,lnp,count,first,last:longint;
  12. hit:boolean;
  13. begin
  14. ln:=length(somestring);
  15. lnp:=length(partstring);
  16. count:=0; first:=1;
  17. for i:=1 to ln do
  18.    if (somestring[i]=#10) or (i=ln) then
  19.       begin
  20.       count:=count+1;
  21.       last:=i; if somestring[i]=#10 then last:=i-1; // the line text is first..last
  22.       hit:=false;
  23.       k:=first;
  24.       while (lnp>0) and (not hit) and (k<=last-lnp+1) do
  25.          begin
  26.          j:=0;
  27.          while (j<lnp) and (somestring[k+j]=partstring[j+1]) do j:=j+1;
  28.          hit:=(j=lnp);
  29.          k:=k+1;
  30.          end;
  31.       if hit then ncount:=ncount+1;
  32.       first:=i+1;
  33.       end;
  34.   exit(count);
  35. end;
  36.  
  37.  
  38. function filelength(filename:ansistring):longword;
  39. // Size of the file in bytes, obtained by opening it as a file of byte.
  40. var
  41. handle : File Of byte;
  42. begin
  43. Assign (handle,filename);
  44.   Reset (handle);
  45.   filelength:=FileSize(handle);
  46.   Close (handle);
  47. end;
  48.  
  49. procedure loadfile(var content: ansistring;filename:ansistring);
  50.    Var Fin : File; x:longint;
  51.    begin // reads the whole file into content as a single record of x bytes
  52.    x:=filelength(filename);
  53.    setlength(content,x);
  54.    if x=0 then exit; // empty file: nothing to read, and content[1] does not exist
  55.    Assign (Fin,filename);
  56.    Reset (Fin,x);
  57.    BlockRead (Fin,content[1],1);
  58.    close(fin);
  59. end;
  60.  
  61.  procedure savefile(s:ansistring ;filename:ansistring);
  62.     var
  63.     fout:file;
  64.     begin // writes s as one record of length(s) bytes; '' just creates an empty file
  65.     Assign(fout,filename);
  66.     if s='' then Rewrite(fout,1) else Rewrite(fout,length(s));
  67.     if s<>'' then blockwrite(fout,s[1],1); // s[1] does not exist for ''
  68.     close(fout);
  69.   end;
  70.        
  71.         procedure createfile(filename:ansistring); // optional
  72.        // Builds 340000000 characters of sample text by repeated doubling and saves them.
  73.        var
  74.        sample:ansistring='1234567890'+#10+'44466'+#10;
  75.        begin
  76.        writeln('creating file . . .');
  77.        repeat // the seed is 17 characters long, so at least one doubling always happens
  78.        sample:=sample+sample;
  79.        until length(sample)>350000000;
  80.        sample:=leftstr(sample,340000000);
  81.        writeln('string created');
  82.        savefile(sample,filename);
  83.        writeln('saved');
  84.        end;
  85.  
  86.  
  87.  var
  88.        lineTotal:longint;
  89.        hitLines:longint=0;
  90.        content:ansistring='';
  91.        startTick:int64;
  92.        filename:ansistring='test.txt';
  93.  
  94. begin
  95.  
  96. //createfile(filename);
  97.  
  98.   startTick:=gettickcount64; // the measured time covers both the load and the scan
  99.   loadfile(content,filename);
  100.   lineTotal:=tally(content,'46',hitLines);
  101.   writeln('Number of lines = ',lineTotal,'  Number of lines containing 46 = ',hitLines);
  102.   writeln('time taken ',(gettickcount64-startTick)/1000);
  103.   writeln('File size = ',filelength(filename) div 1000000,' mb');
  104.   writeln;
  105.   writeln;
  106.   writeln('sample of file,starting few and ending few ');
  107.   writeln(leftstr(content,4*17)); // 17 characters per pair of sample lines, so four pairs
  108.   writeln('. . .');
  109.   writeln('. . .');
  110.   writeln;
  111.   writeln(rightstr(content,4*17));
  112.   writeln('Press return to finish');
  113.   readln;
  114.    
  115. end.
  116.  
« Last Edit: March 04, 2022, 12:15:29 pm by BobDog »

 

TinyPortal © 2005-2018