* * *

Author Topic: StringReplace extremely slow with big strings  (Read 19218 times)

jarto

  • Full Member
  • ***
  • Posts: 106
Re: StringReplace extremely slow with big strings
« Reply #15 on: October 16, 2014, 07:49:42 pm »
you need more efficient substring search than PosEx().

This is too funny. PosEx() uses indexbyte to find the first byte. It's even assembler code.
Pos() uses a simple while loop and is faster :-D

CM630

  • Hero Member
  • *****
  • Posts: 827
  • Не съм сигурен, че те разбирам.
    • http://sourceforge.net/u/cm630/profile/
Re: StringReplace extremely slow with big strings
« Reply #16 on: January 22, 2018, 12:36:08 pm »
Hi, any idea when will these fixes will be applied in an official Lazarus release?
Лазар 1,8,0;W7 64bit or XP 32bit;FPC3,0,4;rev 56594

ASerge

  • Hero Member
  • *****
  • Posts: 903
Re: StringReplace extremely slow with big strings
« Reply #17 on: January 22, 2018, 04:03:36 pm »
If the unit posex is in is already referenced by/present in the unit stringreplace is in (sorry to lazy to look it up now) I don't see a problem?
New StringReplace use PosEx, which in StrUtils, which uses SysUtils, which contain StringReplace.
Of course, the cyclic dependency can be broken through the implementation section, but I do not like this decision.

marcov

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 6506
Re: StringReplace extremely slow with big strings
« Reply #18 on: January 22, 2018, 04:06:44 pm »
Afaik in some recent version POS() has gotten an extra parameter and is now posex like. Might change the cycle.

IIRC it is even merged to 3.0.4, but am not 100% sure.

RAW

  • Hero Member
  • *****
  • Posts: 672
Re: StringReplace extremely slow with big strings
« Reply #19 on: January 22, 2018, 07:23:46 pm »
Maybe this works for you....
Windows 7 Pro (x64 Sp1) And Windows XP Pro (x86 Sp3) - LAZARUS 1.8.4 FPC 3.0.4 // 1.7 FPC 3.1.1

CM630

  • Hero Member
  • *****
  • Posts: 827
  • Не съм сигурен, че те разбирам.
    • http://sourceforge.net/u/cm630/profile/
Re: StringReplace extremely slow with big strings
« Reply #20 on: January 23, 2018, 08:08:36 am »
Thanks, I looked at the code... since it uses Posex, I have a bad feeling it won't works with UTF8... ?

Until now I used this code from Lazarus bugzilla:

Code: Pascal  [Select]
  1. //ANSI
  2. Function StringReplaceNew(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
  3. var
  4.   OldPat,Srch: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
  5.   PatLength,NewPatLength,P,Cnt,PatCount,PrevP: Integer;
  6.   c,d: pchar;
  7. begin
  8.   PatLength:=Length(OldPattern);
  9.   if PatLength=0 then begin
  10.     Result:=S;
  11.     exit;
  12.   end;
  13.  
  14.  
  15.   if rfIgnoreCase in Flags then begin
  16.     Srch:=AnsiUpperCase(S);
  17.     OldPat:=AnsiUpperCase(OldPattern);
  18.   end else begin
  19.     Srch:=S;
  20.     OldPat:=OldPattern;
  21.   end;
  22.  
  23.  
  24.   PatLength:=Length(OldPat);
  25.   if Length(NewPattern)=PatLength then begin
  26.     //Result length will not change
  27.     Result:=S;
  28.     P:=1;
  29.     repeat
  30.       P:=PosEx(OldPat,Srch,P);
  31.       if P>0 then begin
  32.         move(NewPattern[1],Result[P],PatLength);
  33.         if not (rfReplaceAll in Flags) then exit;
  34.         inc(P,PatLength);
  35.       end;
  36.     until p=0;
  37.   end else begin
  38.     //Different pattern length -> Result length will change
  39.     //To avoid creating a lot of temporary strings, we count how many
  40.     //replacements we're going to make.
  41.     P:=1; PatCount:=0;
  42.     repeat
  43.       P:=PosEx(OldPat,Srch,P);
  44.       if P>0 then begin
  45.         inc(P,PatLength);
  46.         inc(PatCount);
  47.         if not (rfReplaceAll in Flags) then break;
  48.       end;
  49.     until p=0;
  50.     if PatCount=0 then begin
  51.       Result:=S;
  52.       exit;
  53.     end;
  54.     NewPatLength:=Length(NewPattern);
  55.     SetLength(Result,Length(S)+PatCount*(NewPatLength-PatLength));
  56.     P:=1; PrevP:=0;
  57.     c:=pchar(Result); d:=pchar(S);
  58.     repeat
  59.       P:=PosEx(OldPat,Srch,P);
  60.       if P>0 then begin
  61.         Cnt:=P-PrevP-1;
  62.         if Cnt>0 then begin
  63.           Move(d^,c^,Cnt);
  64.           inc(c,Cnt);
  65.           inc(d,Cnt);
  66.         end;
  67.         if NewPatLength>0 then begin
  68.           Move(NewPattern[1],c^,NewPatLength);
  69.           inc(c,NewPatLength);
  70.         end;
  71.         inc(P,PatLength);
  72.         inc(d,PatLength);
  73.         PrevP:=P-1;
  74.         if not (rfReplaceAll in Flags) then break;
  75.       end;
  76.     until p=0;
  77.     Cnt:=Length(S)-PrevP;
  78.     if Cnt>0 then Move(d^,c^,Cnt);
  79.   end;
  80. end;    

but it makes mess with UTF8 strings (even though I am replacing ansi strings with ansi strings).
So I tried to modify it to:


Code: Pascal  [Select]
  1. //UTF8
  2. Function StringReplaceNew(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
  3. var
  4.   OldPat,Srch: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
  5.   PatLength,NewPatLength,P,Cnt,PatCount,PrevP: Integer;
  6.   c,d: pchar;
  7. begin
  8.   PatLength:=utf8Length(OldPattern);
  9.   if PatLength=0 then begin
  10.     Result:=S;
  11.     exit;
  12.   end;
  13.  
  14.  
  15.   if rfIgnoreCase in Flags then begin
  16.     Srch:=UTF8UpperCase(S);
  17.     OldPat:=UTF8UpperCase(OldPattern);
  18.   end else begin
  19.     Srch:=S;
  20.     OldPat:=OldPattern;
  21.   end;
  22.  
  23.  
  24.   PatLength:=utf8Length(OldPat);
  25.   if utf8Length(NewPattern)=PatLength then begin
  26.     //Result length will not change
  27.     Result:=S;
  28.     P:=1;
  29.     repeat
  30.       P:=utf8Pos(OldPat,Srch,P);
  31.       if P>0 then begin
  32.         move(NewPattern[1],Result[P],PatLength);
  33.         if not (rfReplaceAll in Flags) then exit;
  34.         inc(P,PatLength);
  35.       end;
  36.     until p=0;
  37.   end else begin
  38.     //Different pattern length -> Result length will change
  39.     //To avoid creating a lot of temporary strings, we count how many
  40.     //replacements we're going to make.
  41.     P:=1; PatCount:=0;
  42.     repeat
  43. //      P:=utf8Pos(OldPat,Srch,P);   //This counts only the occurences of the sougth string, so there is possibly no reason to use UFT8Pos, whish is extremely slow
  44.       P:=Posex(OldPat,Srch,P);
  45.       if P>0 then begin
  46.         inc(P,PatLength);
  47.         inc(PatCount);
  48.         if not (rfReplaceAll in Flags) then break;
  49.       end;
  50.     until p=0;
  51.     if PatCount=0 then begin
  52.       Result:=S;
  53.       exit;
  54.     end;
  55.     NewPatLength:=utf8Length(NewPattern);
  56.     SetLength(Result,utf8Length(S)+PatCount*(NewPatLength-PatLength));
  57.     P:=1; PrevP:=0;
  58.     c:=pchar(Result); d:=pchar(S);
  59.     repeat
  60.       P:=utf8Pos(OldPat,Srch,P);
  61.       if P>0 then begin
  62.         Cnt:=P-PrevP-1;
  63.         if Cnt>0 then begin
  64.           Move(d^,c^,Cnt);
  65.           inc(c,Cnt);
  66.           inc(d,Cnt);
  67.         end;
  68.         if NewPatLength>0 then begin
  69.           Move(NewPattern[1],c^,NewPatLength);
  70.           inc(c,NewPatLength);
  71.         end;
  72.         inc(P,PatLength);
  73.         inc(d,PatLength);
  74.         PrevP:=P-1;
  75.         if not (rfReplaceAll in Flags) then break;
  76.       end;
  77.     until p=0;
  78.     Cnt:=utf8Length(S)-PrevP;
  79.     if Cnt>0 then Move(d^,c^,Cnt);
  80.   end;
  81. end;
I have no idea if it works properly, but is extremely slow.  :'(  It seems to me that utf8Pos is the bottle neck. A very narrow one.
« Last Edit: January 23, 2018, 09:00:07 am by CM630 »
Лазар 1,8,0;W7 64bit or XP 32bit;FPC3,0,4;rev 56594

Thaddy

  • Hero Member
  • *****
  • Posts: 6736
Re: StringReplace extremely slow with big strings
« Reply #21 on: January 23, 2018, 09:10:24 am »
If you replace ansi with ansi, plz do NOT use string in Lazarus: that is UTF8. Rename string to AnsiString.
Ada's daddy wrote this:"Fools are my theme, let satire be my song."

CM630

  • Hero Member
  • *****
  • Posts: 827
  • Не съм сигурен, че те разбирам.
    • http://sourceforge.net/u/cm630/profile/
Re: StringReplace extremely slow with big strings
« Reply #22 on: January 23, 2018, 10:12:27 am »
@Thaddy, I am not sure that I understood you right and that you understood me. I am replacing Ansi with Ansi in an UTF8 string.

In other words: Strings which I am replacing are with ascii code <127 (basic Latin, &%$# etc). But the sting which contains them may also contain uтf8 characters.
« Last Edit: January 23, 2018, 10:31:54 am by CM630 »
Лазар 1,8,0;W7 64bit or XP 32bit;FPC3,0,4;rev 56594

JuhaManninen

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 3489
  • I like bugs.
Re: StringReplace extremely slow with big strings
« Reply #23 on: January 23, 2018, 12:01:32 pm »
If you replace ansi with ansi, plz do NOT use string in Lazarus: that is UTF8. Rename string to AnsiString.
Thaddy, in Lazarus by default String = AnsiString.
You should however use type String always because the code is then compatible with Delphi and with future mode DelphiUnicode solutions.
Remember, Ansi...() functions like AnsiUpperCase() work transparently with different encodings then.

If someone still uses the old Windows codepages, then please switch to Unicode.

CM630, please see the example:
 http://wiki.lazarus.freepascal.org/UTF8_strings_and_characters#Search_and_copy
Pos() and PosEx() work in StringReplaceNew for the same reasons they work in the given example.
CodeUnit (Pascal Char) resolution is still very useful with variable width encodings, and is fast.

CM630

  • Hero Member
  • *****
  • Posts: 827
  • Не съм сигурен, че те разбирам.
    • http://sourceforge.net/u/cm630/profile/
Re: StringReplace extremely slow with big strings
« Reply #24 on: January 23, 2018, 12:58:59 pm »

1. What I got from the example is that I should keep PosEx and Length in StringReplaceNew.


2. CodeUnit is some type of variable. But I could no find more info about it.
3. My current (old) implementation of StringReplaceNew messes the strings up.
4. I will try RAW's solution and feedback.
    Edit: I tried it. My app crashes with a range check error. :(
« Last Edit: January 23, 2018, 01:12:21 pm by CM630 »
Лазар 1,8,0;W7 64bit or XP 32bit;FPC3,0,4;rev 56594

JuhaManninen

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 3489
  • I like bugs.
Re: StringReplace extremely slow with big strings
« Reply #25 on: January 23, 2018, 01:11:37 pm »
2. CodeUnit is some type of variable. But I could no find more info about it.
No. It is a concept of Unicode. It is the smallest atomic building block. A code point can consist of one or more code units.
In Pascal terms it is type Char, a byte with UTF-8 and a word with UTF-16.
See:
 https://en.wikipedia.org/wiki/Character_encoding#Terminology
and search for "code unit".

engkin

  • Hero Member
  • *****
  • Posts: 2104
Re: StringReplace extremely slow with big strings
« Reply #26 on: January 24, 2018, 04:41:56 pm »
3. My current (old) implementation of StringReplaceNew messes the strings up.

You did not show the current implementation. I assume something like:
Code: Pascal  [Select]
  1. Function StringReplaceNew(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
  2. var
  3.   OldPat,Srch: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
  4.   PatLength,NewPatLength,P,Cnt,PatCount,PrevP: Integer;
  5.   c,d: pchar;
  6. begin
  7.   PatLength:=Length(OldPattern);
  8.   if PatLength=0 then
  9.     exit(S);
  10.  
  11.   if rfIgnoreCase in Flags then begin
  12.     Srch:=UTF8UpperCase(S);
  13.     OldPat:=UTF8UpperCase(OldPattern);
  14.   end else begin
  15.     Srch:=S;
  16.     OldPat:=OldPattern;
  17.   end;
  18.  
  19.   PatLength:=Length(OldPat);
  20.   if Length(NewPattern)=PatLength then begin
  21.     //Result length will not change
  22.     Result:=S;
  23.     P:=1;
  24.     repeat
  25.       P:=PosEx(OldPat,Srch,P);
  26.       if P>0 then begin
  27.         move(NewPattern[1],Result[P],PatLength);
  28.         if not (rfReplaceAll in Flags) then exit;
  29.         inc(P,PatLength);
  30.       end;
  31.     until p=0;
  32.   end else begin
  33.     //Different pattern length -> Result length will change
  34.     //To avoid creating a lot of temporary strings, we count how many
  35.     //replacements we're going to make.
  36.     P:=1; PatCount:=0;
  37.     repeat
  38.       P:=PosEx(OldPat,Srch,P);
  39.       if P>0 then begin
  40.         inc(P,PatLength);
  41.         inc(PatCount);
  42.         if not (rfReplaceAll in Flags) then break;
  43.       end;
  44.     until p=0;
  45.     if PatCount=0 then begin
  46.       Result:=S;
  47.       exit;
  48.     end;
  49.     NewPatLength := Length(NewPattern);
  50.     SetLength(Result, Length(S) + PatCount*(NewPatLength-PatLength));
  51.     P:=1; PrevP:=0;
  52.     c:=pchar(Result);
  53.     d:=pchar(S);
  54.     repeat
  55.       P:=PosEx(OldPat,Srch,P);
  56.       if P>0 then begin
  57.         Cnt:=P-PrevP-1;
  58.         if Cnt>0 then begin
  59.           Move(d^,c^,Cnt);
  60.           inc(c,Cnt);
  61.           inc(d,Cnt);
  62.         end;
  63.  
  64.         Move(NewPattern[1],c^,NewPatLength);
  65.         inc(c,NewPatLength);
  66.         inc(P,PatLength);
  67.         inc(d,PatLength);
  68.         PrevP:=P-1;
  69.         if not (rfReplaceAll in Flags) then break;
  70.       end;
  71.     until p=0;
  72.     Cnt:=Length(S)-PrevP;
  73.     Move(d^,c^,Cnt);
  74.   end;
  75. end;

I believe you have a problem with rfIgnoreCase. Your code assumes that Srch and S have the same length. Upper case and lower case do *not* necessarily have the same length. Check the source code for UTF8UpperCase:
Code: Pascal  [Select]
  1.       if IsTurkish and (AInStr[InCounter] = 'i') then
  2.       begin
  3.         SetLength(Result,Length(Result)+1);// Increase the buffer
  4. ...
  5.         inc(InCounter);
  6.         inc(OutCounter,2);
and other calls to CorrectOutStrSize. You'll have to build the index for both strings and shift the location based on it.

Edit:
UTF8StringReplace from unit LazUTF8 has the same bug. It uses UTF8LowerCase instead.

Code: Pascal  [Select]
  1.   { To test the bug with UTF8StringReplace }
  2.   { LATIN CAPITAL LETTER A WITH STROKE' (U+23A) Ⱥ}
  3.   SetLength(sC8BA,2);
  4.   sC8BA[1] := #$C8;
  5.   sC8BA[2] := #$BA; //or #$BE
  6.  
  7.   { To test the bug with NewStringReplace }
  8.   { LATIN SMALL LETTER TURNED ALPHA' (U+0252) ɒ}
  9.   SetLength(sC992,2);
  10.   sC992[1] := #$C9;
  11.   sC992[2] := #$92;
« Last Edit: January 24, 2018, 06:27:06 pm by engkin »

JuhaManninen

  • Global Moderator
  • Hero Member
  • *****
  • Posts: 3489
  • I like bugs.
Re: StringReplace extremely slow with big strings
« Reply #27 on: January 25, 2018, 10:45:22 pm »
I believe you have a problem with rfIgnoreCase. Your code assumes that Srch and S have the same length. Upper case and lower case do *not* necessarily have the same length.
True. It also means functions UTF8SwapCase and my recently added UTF8ProperCase in unit LazUTF8 are buggy.
On the positive side the lengths are the same in most cases but that is no excuse for a bug of course.

Quote
UTF8StringReplace from unit LazUTF8 has the same bug. It uses UTF8LowerCase instead.
Ok, I didn't realize that one. The function has been there forever.

If somebody wants to fix those bugs, please do. I can commit the fixes.

CM630

  • Hero Member
  • *****
  • Posts: 827
  • Не съм сигурен, че те разбирам.
    • http://sourceforge.net/u/cm630/profile/
Re: StringReplace extremely slow with big strings
« Reply #28 on: January 26, 2018, 08:44:12 am »
You did not show the current implementation. I assume something like:
I did in post 20 (first function). Or you mean sth. else?


I believe you have a problem with rfIgnoreCase. Your code assumes that Srch and S have the same length. Upper case and lower case do *not* necessarily have the same length. Check the source code for UTF8UpperCase:
I think your belief is right! I removed all rfIgnoreCase and now I do not notice any corruption of the string (but I cannot say for sure that there is not any).

On the positive side the lengths are the same in most cases but that is no excuse for a bug of course.

On the negative side StringReplace will never work for Turkish and some other languages.
« Last Edit: January 26, 2018, 12:18:07 pm by CM630 »
Лазар 1,8,0;W7 64bit or XP 32bit;FPC3,0,4;rev 56594

engkin

  • Hero Member
  • *****
  • Posts: 2104
Re: StringReplace extremely slow with big strings
« Reply #29 on: January 27, 2018, 05:20:08 am »
Here is a possible replacement for UTF8StringReplace. It needs to be tested.
Code: Pascal  [Select]
  1. function UTF8StringReplace_1(const S, OldPattern, NewPattern: string;  Flags: TReplaceFlags): string;
  2. var
  3.   Matches: array of integer;
  4.   OldPat,Srch: string; // Srch and Oldp can contain uppercase versions of S,OldPattern
  5.   PatLength,P,Count, Capacity, i: Integer;
  6.   p1,p2, MemPos: pchar;
  7.   i1,i2, l: integer;
  8. begin
  9.   if (Length(OldPattern)=0) or (Length(S)=0) then
  10.     exit(S);
  11.  
  12.   if rfIgnoreCase in Flags then begin
  13.     Srch := UTF8LowerCase(S);
  14.     OldPat := UTF8LowerCase(OldPattern);
  15.   end else begin
  16.     Srch := S;
  17.     OldPat := OldPattern;
  18.   end;
  19.  
  20.   { Find matches }
  21.   Count := 0;
  22.   Capacity := 10;
  23.   SetLength(Matches, Capacity);
  24.   P:=PosEx(OldPat, Srch, 1);
  25.   if P=0 then exit(s);
  26.   if rfReplaceAll in Flags then  { TODO : Consider using FindMatchesBoyerMooreCaseSensitive }
  27.     while p<>0 do begin
  28.       Matches[Count] := p;
  29.       inc(Count);
  30.       if Count=Capacity then begin { Grow }
  31.         inc(Capacity, 10);
  32.         SetLength(Matches, Capacity);
  33.       end;
  34.       P:=PosEx(OldPat,Srch,P+1);
  35.     end
  36.   else begin
  37.       Matches[Count] := p;
  38.       inc(Count);
  39.   end;
  40.  
  41.   if rfIgnoreCase in Flags then begin { Correct match positions }
  42.     p1 := @Srch[1]; p2 := @S[1];
  43.     for i := 0 to Count-1 do begin
  44.       MemPos := @Srch[Matches[i]];
  45.       while p1<MemPos do begin
  46.         inc(p1, UTF8CharacterLengthFast(p1));
  47.         inc(p2, UTF8CharacterLengthFast(p2));
  48.       end;
  49.       Matches[i] := p2-@S[1]+1;
  50.     end;
  51.   end;
  52.  
  53.   PatLength := Length(OldPat);
  54.   if Length(NewPattern)=PatLength then begin
  55.     //Result length will not change
  56.     Result:=S;
  57.     if (rfReplaceAll in Flags) then
  58.       for i := 0 to Count-1 do move(NewPattern[1], Result[Matches[i]], PatLength)
  59.     else
  60.       move(NewPattern[1], Result[Matches[0]], PatLength);
  61.   end else begin
  62.     SetLength(Result, Length(S) + Count*(Length(NewPattern)-Length(OldPattern)));
  63.     i1 := 1; i2 := 1;
  64.     for i := 0 to Count-1 do begin
  65.       l := Matches[i]-i1;
  66.       move(S[i1], Result[i2], l); { Copy text before pattern }
  67.       inc(i2, l); { Move to the location of the pattern }
  68.       i1 := Matches[i]+Length(OldPattern); { Move over the old pattern }
  69.  
  70.       move(NewPattern[1], Result[i2], Length(NewPattern));  { Copy the new pattern }
  71.       inc(i2, Length(NewPattern));
  72.     end;
  73.     if i1<=Length(S) then
  74.       move(S[i1], Result[i2], Length(S)-i1+1); { Copy leftover text }
  75.   end;
  76. end;

 

Recent

Get Lazarus at SourceForge.net. Fast, secure and Free Open Source software downloads Open Hub project report for Lazarus