Recent

Author Topic: TextMan - some useful Text Manipulation routines  (Read 5311 times)

munair

  • Hero Member
  • *****
  • Posts: 798
  • compiler developer @SharpBASIC
    • SharpBASIC
TextMan - some useful Text Manipulation routines
« on: April 09, 2015, 11:06:36 pm »
One of the routines included in the source is InStr, a powerful function well known by (Visual) Basic programmers (I really missed an equivalent in Pascal).

All routines support Unicode UTF8.

Have fun!

EDIT: Case Sensitivity added to Replace and ReplaceAll

Code: Pascal  [Select][+][-]
  1. unit uTextMan;
  2.  
  3. { This sourcecode is released under the GPLv3.
  4.   Written by Mematis
  5.   http://www.mematis.com }
  6.  
  7. {$mode objfpc}{$H+}
  8.  
  9. interface
  10.  
  11. uses
  12.    Classes, SysUtils, lazutf8;
  13.  
  14. type
  15.   TStrArray = array of string;
  16.  
  17. const
  18.   // Instr mode flags
  19.   isForward  = 0;
  20.   isBackward = 1;
  21.   isNumber   = 2;
  22.   isNoCase   = 4;
  23.   // useful filters for TextItems
  24.   tiNumberDigits = '0123456789';
  25.   tiRealNumberDigits = '.' + tiNumberDigits;
  26.  
  27. procedure Replace(var source: string; const oldstr, newstr: string;
  28.   CaseSensitive: boolean = true);
  29. procedure ReplaceAll(var source: string; const oldstr, newstr: string;
  30.   CaseSensitive: boolean = true);
  31. function InStr(const SourceString: string; const SearchString: string; const
  32.   SearchMode: longint = isForward; const OrdSelector: longint = 0): longint;
  33. function InStr(StartPos: longint; const SourceString: string; const
  34.   SearchString: string; SearchMode: longint = isForward; const OrdSelector:
  35.   longint = 0): longint;
  36. function TextItems(const SourceString, SearchString: string): TStrArray;
  37.  
  38. implementation
  39.  
  40. procedure Replace(var source: string; const oldstr, newstr: string;
  41.   CaseSensitive: boolean = true);
  42. var
  43.   l, mode, p: longint;
  44. begin
  45.   mode := isForward;
  46.   if not CaseSensitive then
  47.     mode := mode or isNoCase;
  48.   p := InStr(source, oldstr, mode);
  49.   if p > 0 then
  50.     begin
  51.       l := UTF8Length(oldstr);
  52.       UTF8Delete(source, p, l);
  53.       UTF8Insert(newstr, source, p);
  54.     end;
  55. end;
  56.  
  57. procedure ReplaceAll(var source: string; const oldstr, newstr: string;
  58.   CaseSensitive: boolean = true);
  59. var
  60.   la, lb, mode, p: longint;
  61. begin
  62.   p := 0;
  63.   la := UTF8Length(oldstr);
  64.   lb := UTF8Length(newstr);
  65.   mode := isForward;
  66.   if not CaseSensitive then
  67.     mode := mode or isNoCase;
  68.   repeat
  69.     p := InStr(p, source, oldstr, mode);
  70.     if p > 0 then
  71.       begin
  72.         UTF8Delete(source, p, la);
  73.         UTF8Insert(newstr, source, p);
  74.         p := p + lb;
  75.       end;
  76.   until p = 0;
  77. end;
  78.  
  79. function InStr(StartPos: longint; const SourceString: string; const
  80.   SearchString: String; SearchMode: longint = isForward; const OrdSelector:
  81.   longint = 0): longint;
  82. { - Originally written by Kevin Provance:
  83.     http://www.vincenzo.net/isxkb/index.php?title=Instr_for_Pascal.
  84.   - Modifcations including UTF8 support by Mematis:
  85.     http://www.mematis.com }
  86. var
  87.   a, b, i, p: longint;
  88.   tmpStr1, tmpStr2, Str1: string;
  89. begin
  90.   result := 0;
  91.  
  92.   if Length(SourceString) = 0 then
  93.     exit;
  94.  
  95.   tmpStr1 := SourceString;
  96.   tmpStr2 := SearchString;
  97.  
  98.   if (SearchMode and isBackward) = 0 then
  99.     SearchMode := (SearchMode or isForward);
  100.  
  101.   if (SearchMode and isNoCase) <> 0 then
  102.     begin
  103.       tmpStr1 := LowerCase(tmpStr1);
  104.       tmpStr2 := LowerCase(tmpStr2);
  105.     end;
  106.  
  107.   p := OrdSelector;
  108.   if p = 0 then
  109.     p := 1;
  110.  
  111.   p := p - 1;
  112.  
  113.   case (SearchMode and 1) of
  114.     isForward:
  115.       begin
  116.         if StartPos = 0 then
  117.           StartPos := 1;
  118.         a := StartPos;
  119.         b := UTF8Length(tmpStr1);
  120.         for i := a to b do
  121.           begin
  122.             Str1 := UTF8Copy(tmpStr1, i, UTF8Length(tmpStr2));
  123.             if Str1 = tmpStr2 then
  124.               begin
  125.                 if (SearchMode and isNumber) <> 0 then
  126.                   result := result + 1
  127.                 else
  128.                   begin
  129.                     result := i;
  130.                     if p = 0 then
  131.                       exit
  132.                     else
  133.                       p := p - 1;
  134.                   end;
  135.               end;
  136.           end;
  137.       end;
  138.     isBackward:
  139.       begin
  140.         if StartPos = 0 then
  141.           StartPos := UTF8Length(tmpStr1);
  142.         a := StartPos;
  143.         b := 1;
  144.         for i := a downto b do
  145.           begin
  146.             Str1 := UTF8Copy(tmpStr1, i, UTF8Length(tmpStr2));
  147.             if Str1 = tmpStr2 then
  148.               begin
  149.                 if (SearchMode and isNumber) <> 0 then
  150.                   begin
  151.                     result := result + 1;
  152.                   end
  153.                 else
  154.                   begin
  155.                     result := (a - i) + UTF8Length(tmpStr2);
  156.                     if p = 0 then
  157.                       exit
  158.                     else
  159.                       p := p - 1;
  160.                   end;
  161.               end;
  162.           end;
  163.       end;
  164.   end;
  165.   if (SearchMode and isNumber) = 0 then
  166.     result := 0;
  167. end;
  168.  
  169. function InStr(const SourceString: string; const SearchString: string; const
  170.   SearchMode: longint = isForward; const OrdSelector: longint = 0): longint;
  171. // in case you do not need the start position
  172. begin
  173.   result := Instr(0, SourceString, SearchString, SearchMode, OrdSelector);
  174. end;
  175.  
  176. function TextItems(const SourceString, SearchString: string): TStrArray;
  177. // If nothing is found, an empty array is returned (upperbound -1).
  178. { This function can be used to find specific elements in a text, such as
  179.   date elements:
  180.  
  181.   s := '1\15\2015' // any fancy separator other than a number
  182.   TStrArray := TextItems(s, tiNumberDigits);
  183.   -> TStrArray[0] = '1'
  184.   -> TStrArray[1] = '15'
  185.   -> TStrArray[2] = '2015' }
  186. var
  187.   t: TStrArray;
  188.   i, n: longint;
  189.   s: string;
  190.   match: boolean;
  191. begin
  192.   match := false;
  193.   n := -1;
  194.   for i := 1 to UTF8Length(SourceString) do
  195.     begin
  196.       s := UTF8Copy(SourceString, i, 1);
  197.       if UTF8Pos(s, SearchString) > 0 then
  198.         if match then
  199.           t[n] := t[n] + s
  200.         else
  201.           begin
  202.             inc(n);
  203.             SetLength(t, n + 1);
  204.             t[n] := s;
  205.             match := true;
  206.           end
  207.       else
  208.         if match then
  209.           match := false;
  210.     end;
  211.   result := t;
  212. end;
  213.  
  214. end.
  215.  

INSTR EXAMPLES:

Code: Pascal  [Select][+][-]
  1. var
  2.    lMarker: Integer;
  3.  
  4. const
  5.    sSource = 'how now, brown cow.';
  6.  
  7. // Forward search for 'now'. (5)
  8. lMarker := InStr(sSource, 'now', isForward);
  9.  
  10. // Backward search for the comma. (12)
  11. lMarker := InStr(sSource, ',', isBackward);
  12.  
  13. // Number of instances of the letter 'o'. (4)
  14. lMarker := InStr(sSource, 'o', isForward or isNumber);
  15.  
  16. // Position of the third instance of the letter 'o'. (12)
  17. lMarker := InStr(sSource, 'o', isForward, 3);
  18.  
  19. // Case insensitive search of the word 'cow' starting at the comma. (16)
  20. lMarker := InStr(8, sSource, 'CoW', isForward or isNoCase);
« Last Edit: October 17, 2017, 03:01:37 pm by Munair »
keep it simple

marcov

  • Administrator
  • Hero Member
  • *
  • Posts: 11459
  • FPC developer.
Re: TextMan - some useful Text Manipulation routines
« Reply #1 on: April 10, 2015, 10:54:57 am »
One of the routines included in the source is InStr, a powerful function well known by (Visual) Basic programmers (I really missed an equivalent in Pascal).

See the pos variants in strutils:

To start on another position:
Code: [Select]
Function PosEx(const SubStr, S: string; Offset: Cardinal): Integer;
Function PosEx(const SubStr, S: string): Integer;inline; // Offset: Cardinal = 1
Function PosEx(c:char; const S: string; Offset: Cardinal): Integer;
search backwards (and from another position:
Code: [Select]
Function RPosEX(C:char;const S : AnsiString;offs:cardinal):Integer; overload;
Function RPosex (Const Substr : AnsiString; Const Source : AnsiString;offs:cardinal) : Integer; overload;
Function RPos(c:char;const S : AnsiString):Integer; overload;
Function RPos (Const Substr : AnsiString; Const Source : AnsiString) : Integer; overload;
Search for any array of a set (or characters in a string)
Code: [Select]
function PosSet (const c:TSysCharSet;const s : ansistring ):Integer;
function PosSet (const c:string;const s : ansistring ):Integer;
function PosSetEx (const c:TSysCharSet;const s : ansistring;count:Integer ):Integer;
function PosSetEx (const c:string;const s : ansistring;count:Integer ):Integer;

The above are for one 1 character sets including utf8. (except the "char" variants that are obviously not multibyte safe)

For case insensitive work you need to use Lazarus routines. The FPC ones are native and don't follow Lazarus utf8 convention. (but use the standard 1 byte encoding on Windows)

The only thing that can go wrong is denormalized characters and the like, but your routine doesn't handle that either (you would need comparetext(str1,tmpstr2) instead of str1=tmpstr2 or something like that)

The trick to capitalize to do case insensitive is also a bit dangerous, this afaik doesn't work for certain Asian languages.

This is one reason why you want to separate straight searching and case sensitive searching in different routines. Their performance characteristics are very dissimilar.



munair

  • Hero Member
  • *****
  • Posts: 798
  • compiler developer @SharpBASIC
    • SharpBASIC
Re: TextMan - some useful Text Manipulation routines
« Reply #2 on: April 10, 2015, 01:07:42 pm »
Thanks Markov,

I didn't know these Pos variants were there to begin with (I started programming Pascal about a month ago). If I recall corectly, Pos in the documentation doesn't hint to them in 'See also'.

Still, this InStr version has two more features: OrdSelector and IsNumber (if anyone has any use for it).
« Last Edit: April 10, 2015, 02:36:22 pm by Artie »
keep it simple

 

TinyPortal © 2005-2018