Recent

Author Topic: Regex question  (Read 3109 times)

ASerge

  • Hero Member
  • *****
  • Posts: 1411
Re: Regex question
« Reply #30 on: October 12, 2019, 01:06:24 am »
Thank you, but unfortunately it is not good for me. I need REGEX.
Code: Pascal  [Select]
  1. {$MODE OBJFPC}
  2. {$APPTYPE CONSOLE}
  3. {$LONGSTRINGS ON}
  4.  
  5. uses RegExpr;
  6.  
  7. procedure Test(const S: string); // prints the first word of each line of S in double quotes
  8. var
  9.   Matcher: TRegExpr;
  10. begin
  11.   Matcher := TRegExpr.Create('');
  12.   try
  13.     Matcher.ModifierM := True; // multi-line mode, so that ^ can anchor at every line start
  14.     Matcher.Expression := '^\W*(\w+)';
  15.     if not Matcher.Exec(S) then Exit; // nothing matched; the finally part still frees Matcher
  16.     repeat
  17.       Writeln('"', Matcher.Match[1], '"');
  18.     until not Matcher.ExecNext;
  19.   finally
  20.     Matcher.Free;
  21.   end;
  22. end;
  23.  
  24. const // sample input: a brace-delimited block with one entry per line
  25.   CSampleInputText =
  26.     '{' + LineEnding +
  27.     '   First,//Something text' + LineEnding +
  28.     '   Second,' + LineEnding +
  29.     '   Small//Something text' + LineEnding +
  30.     '   Big' + LineEnding +
  31.     '}';
  32. begin
  33.   Test(CSampleInputText); // writes one quoted word per matching line
  34.   Readln; // wait for Enter before the program ends
  35. end.

howardpc

  • Hero Member
  • *****
  • Posts: 3178
Re: Regex question
« Reply #31 on: October 12, 2019, 11:19:09 am »
I don't know in advance how many strings will be there, the 4 is just an example. So, I need a general solution, regex can do it.
A more general (non-regex) solution might be something like the following.
Code: Pascal  [Select]
  1. program ParseExample;
  2.  
  3. {$AppType console}
  4. {$Mode objfpc}{$H+}      
  5.  
  6.  
  7. uses Classes, SysUtils;
  8.  
  9. const // sample input; the "extra" line is padded with blanks on both sides
  10.   txt = '{' + LineEnding +
  11.         '   First,//Something text' + LineEnding +
  12.         '   Second,' + LineEnding +
  13.         '                          extra                   ' + LineEnding +
  14.         '   Small//Something text' + LineEnding +
  15.         '   Big' + LineEnding + '}';
  16.  
  17. function ParsedCommentToFirstWords(const aTxt: String; list: TStrings): Boolean;
  18. // Replaces list with the first alphabetic word of every line between the '{' and '}'
  19. // lines of aTxt. Returns False if a brace line is missing or a line has no leading word.
  20.  
  21.     function FirstWord(const s: String): String;
  22.     // Letters that follow the leading blanks; the bound is tested before s[p], so '' is safe.
  23.     var
  24.       p, b: Integer;
  25.     begin
  26.       p := 1;
  27.       while (p <= Length(s)) and (s[p] in [' ', #9]) do
  28.         Inc(p);
  29.       b := p;
  30.       while (p <= Length(s)) and (s[p] in ['a'..'z','A'..'Z']) do
  31.         Inc(p);
  32.       Result := Copy(s, b, p-b);
  33.     end;
  34.  
  35. var
  36.   i, min: Integer;
  37. begin
  38.   list.Text := Trim(aTxt);
  39.   if (list.Count < 3) or (Trim(list[0]) <> '{') or (Trim(list[list.Count-1]) <> '}') then
  40.     Exit(False);
  41.   list.Delete(0);
  42.   list.Delete(list.Count-1);
  43.   min := MaxInt; // shortest word length seen; 0 means some line produced no word
  44.   for i := 0 to list.Count-1 do
  45.     begin
  46.       list[i] := FirstWord(list[i]);
  47.       if min > Length(list[i]) then
  48.         min := Length(list[i]);
  49.     end;
  50.   Result := min > 0;
  51. end;
  52.  
  53. var
  54.   sl: TStringList;
  55.   s: String;
  56. begin
  57.   sl := TStringList.Create;
  58.   try
  59.     // Print one first-word per line, or report that txt is not a '{' ... '}' block.
  60.     if ParsedCommentToFirstWords(txt, sl) then
  61.       for s in sl do
  62.         WriteLn(s)
  63.     else
  64.       Writeln('invalid text format for "',txt,'"');
  65.   finally
  66.     sl.Free; // released even if parsing or output raises
  67.   end;
  68. end.

justnewbie

  • Full Member
  • ***
  • Posts: 225
Re: Regex question
« Reply #32 on: October 14, 2019, 10:33:10 pm »
Guys, thank you very much!

justnewbie

  • Full Member
  • ***
  • Posts: 225
Re: Regex question
« Reply #33 on: October 15, 2019, 03:49:45 pm »
Hi ASerge, I wish you were somewhere nearby ...  :)
I have a task that is too complicated for me (again! LOL). I spent hours with it, but ...
I have a text and I need to get all the variable names (highlighted with bold).
What is the proper REGEX pattern to get them? Have you any idea?
Quote
int something;
double anything=2.15;
int nothing  = 7;
string anytext, mytext="",poems;
int   x=7;int z; int w=85;
int k,q,e=2;
« Last Edit: October 15, 2019, 04:32:31 pm by justnewbie »

Thaddy

  • Hero Member
  • *****
  • Posts: 9183
Re: Regex question
« Reply #34 on: October 15, 2019, 04:02:43 pm »
Regular expressions are powerful, but that looks like something where they are not in place.
It is much simpler to compare to a list of reserved words(int,double etc, not something and family) and parse until white space or control is met
« Last Edit: October 15, 2019, 04:06:03 pm by Thaddy »
also related to equus asinus.

justnewbie

  • Full Member
  • ***
  • Posts: 225
Re: Regex question
« Reply #35 on: October 15, 2019, 04:10:36 pm »
Regular expressions are powerful, but that looks like something where they are not in place.
It is much simpler to compare to a list of reserved words(int,double etc, not something and family) and parse until white space or control is met
I don't understand your post. I need to get the variable names (something, anything etc ..) that are highlighted with bold.
« Last Edit: October 15, 2019, 04:13:51 pm by justnewbie »

lucamar

  • Hero Member
  • *****
  • Posts: 2081
Re: Regex question
« Reply #36 on: October 15, 2019, 04:30:39 pm »
Regular expressions are powerful, but that looks like something where they are not in place.
It is much simpler to compare to a list of reserved words(int,double etc, not something and family) and parse until white space or control is met
I don't understand your post. I need to get the variable names (something, anything etc ..) that are highlighted with bold.

It means parsing out the known reserved words, numbers, etc. so that you're left with the "unknown" ones, which are what you're after. For example, parsing the second line: double anything=2.15; you first get "double" which, being a reserved word, you can ignore; then you skip the space(s) and get the word up to the symbol "=": you get "anything" which is not a reserved word but comes after one, so it must be a variable name, which is what you're looking for. Keep going on, skipping the parts in which you're not interested, and you get your list of variables as result.

It might help if you draw a BNF diagram of your lines; that allows you to get a "feel" of what (and how) to parse "in" and what "out".
« Last Edit: October 15, 2019, 04:34:09 pm by lucamar »
Turbo Pascal 3 CP/M - Amstrad PCW 8256 (512 KB !!!) :P
Lazarus 2.0.2/2.0.4  - FPC 3.0.4 on:
(K|L)Ubuntu 12..16, Windows XP SP3, various DOSes.

justnewbie

  • Full Member
  • ***
  • Posts: 225
Re: Regex question
« Reply #37 on: October 15, 2019, 04:40:04 pm »
Regular expressions are powerful, but that looks like something where they are not in place.
It is much simpler to compare to a list of reserved words(int,double etc, not something and family) and parse until white space or control is met
I don't understand your post. I need to get the variable names (something, anything etc ..) that are highlighted with bold.

It means parsing out the known reserved words, numbers, etc. so that you're left with the "unknown" ones, which are what you're after. For example, parsing the second line: double anything=2.15; you first get "double" which, being a reserved word, you can ignore; then you skip the space(s) and get the word up to the symbol "=": you get "anything" which is not a reserved word but comes after one, so it must be a variable name, which is what you're looking for. Keep going on, skipping the parts in which you're not interested, and you get your list of variables as result.

It might help if you draw a BNF diagram of your lines; that allows you to get a "feel" of what (and how) to parse "in" and what "out".
To be honest, I cannot imagine that there isn't a much simpler way.
Think of it: there are hundreds of reserved words and characters (my example is heavily simplified).

This is where I am now: I can get the "orange parts" that contain those names (see picture). The pattern: \b(int|double|string)(\s+)(.+)(;)  >> the smiley is a ; and )
But I don't know how I could get ONLY the names.
« Last Edit: October 15, 2019, 04:54:35 pm by justnewbie »

Thaddy

  • Hero Member
  • *****
  • Posts: 9183
Re: Regex question
« Reply #38 on: October 15, 2019, 04:51:44 pm »
To be honest, I cannot imagine that there isn't a much simpler way.
Think of it: there are hundreds of reserved words and characters (my example is heavily simplified).
It is NOT that easy! As Lucamar confirmed.
For such tasks I usually write a compiler...
Don't be afraid, I mean I write a grammar and use plex and pyacc or GoldParser to  generate the basic code.
In your case any C grammar would generate a correct lexer and parser for your problem so you don't even have to write a grammar......

Now that is a "nice" answer to anyone that thinks simple things must be easy to program.. O:-)

There is a good example: the h2pas sourcecode. (I mean that: you have to do all that)
« Last Edit: October 15, 2019, 04:57:45 pm by Thaddy »
also related to equus asinus.

justnewbie

  • Full Member
  • ***
  • Posts: 225
Re: Regex question
« Reply #39 on: October 15, 2019, 07:20:18 pm »
It can be done with 2 or 3 steps, see pictures. :)
« Last Edit: October 15, 2019, 07:26:45 pm by justnewbie »

howardpc

  • Hero Member
  • *****
  • Posts: 3178
Re: Regex question
« Reply #40 on: October 15, 2019, 07:59:04 pm »
A non-regex solution could be done by extending the list of allowed type names in the following example.
I don't know if digits are allowed in your variable names. If so, you'll need to adjust the parsing routine accordingly.
Code: Pascal  [Select]
  1. program project1;
  2.  
  3. {$mode objfpc}{$H+}
  4. {$IfDef Windows}{$AppType console}{$EndIf}
  5.  
  6. uses
  7.   SysUtils, Types;
  8.  
  9. function IsReserved(const aWord: String): Boolean; // extend this for needed keywords
  10. // True when aWord is one of the type keywords below, compared case-insensitively.
  11. // The former length pre-check "not Length(aWord) in [3, 6]" parsed as
  12. // "(not Length(aWord)) in [3, 6]" and never fired, so it has been dropped;
  13. // keywords of any length can now simply be added to the case labels.
  14. begin
  15.   case LowerCase(aWord) of
  16.     'int', 'string', 'double': Result := True;
  17.     else
  18.       Result := False;
  19.   end;
  20. end;
  21.  
  22. function GetVarNames(aTxt: String): TStringDynArray;
  23. // Returns, in order of appearance, every alphabetic word of aTxt that IsReserved
  24. // rejects. Any character that is not an ASCII letter acts as a separator.
  25. var
  26.   p: Integer = 1;
  27.   index: Integer = 0;
  28.   s: String;
  29.  
  30.   function GetNextWord: String; // assumes variable names must be alphabetical
  31.   var
  32.     start: Integer;
  33.   begin
  34.     // "<=" so that the last character of aTxt is examined too.
  35.     while (p <= Length(aTxt)) and not (aTxt[p] in ['A'..'Z','a'..'z']) do
  36.       Inc(p);
  37.     start := p;
  38.     while (p <= Length(aTxt)) and (aTxt[p] in ['A'..'Z','a'..'z']) do
  39.       Inc(p);
  40.     Result := Copy(aTxt, start, p - start); // one copy instead of per-character appends
  41.   end;
  42.  
  43. begin
  44.   // n words need at least 2n-1 characters, so (Length+1) div 2 slots always suffice.
  45.   SetLength(Result, Succ(Length(aTxt)) shr 1);
  46.   repeat
  47.     s := GetNextWord;
  48.     if (s <> '') and not IsReserved(s) then
  49.       begin
  50.         Result[index] := s;
  51.         Inc(index);
  52.       end;
  53.   until s = ''; // an empty word means the text is exhausted
  54.   SetLength(Result, index);
  55. end;
  56.  
  57. var
  58.   txt: String = 'int something;'+LineEnding+
  59.                 'double anything=2.15;' + LineEnding +
  60.                 'int nothing  = 7;' + LineEnding +
  61.                 'string anytext, mytext="",poems;' + LineEnding +
  62.                 'int   x=7;int z; int w=85;' + LineEnding +
  63.                 'int k,q,e=2;';
  64.   names: TStringDynArray;
  65.   i: Integer;
  66.  
  67. begin
  68.   // Extract the variable names from the sample declarations and list them.
  69.   names := GetVarNames(txt);
  70.   for i := Low(names) to High(names) do
  71.     WriteLn(names[i]);
  72.   WriteLn('Press [Enter] to finish');
  73.   ReadLn;
  74. end.
This outputs
Code: Pascal  [Select]
  1. something
  2. anything
  3. nothing
  4. anytext
  5. mytext
  6. poems
  7. x
  8. z
  9. w
  10. k
  11. q
  12. e
  13. Press [Enter] to finish

Thaddy

  • Hero Member
  • *****
  • Posts: 9183
Re: Regex question
« Reply #41 on: October 15, 2019, 08:17:50 pm »
That's extending white space... (but it is neat code and may be a solution)
The only proper solution is a parser.
« Last Edit: October 15, 2019, 08:20:01 pm by Thaddy »
also related to equus asinus.

justnewbie

  • Full Member
  • ***
  • Posts: 225
Re: Regex question
« Reply #42 on: October 15, 2019, 08:36:19 pm »
Thank you guys for the contribution!

bytebites

  • Full Member
  • ***
  • Posts: 213
Re: Regex question
« Reply #43 on: October 15, 2019, 09:36:31 pm »
Code: Pascal  [Select]
  1. Result := Result + aTxt[p];
This is slow.

howardpc

  • Hero Member
  • *****
  • Posts: 3178
Re: Regex question
« Reply #44 on: October 22, 2019, 04:32:28 pm »
A non-regex solution:
Code: Pascal  [Select]
  1. program project1;
  2.  
  3. {$mode objfpc}{$H+}
  4. {$IfDef Windows}{$AppType console}{$EndIf}
  5.  
  6. uses
  7.   SysUtils;
  8.  
  9. var
  10.   txt: String = 'Something (anything nothing="hey!", anything something, nothing="hola!", thing)' +
  11.                 'something nothing="aloha!"';
  12.   sArr: TStringArray; // bracket fragments filled in by ExtractedBrackets
  13.   s: String; // fragment returned by the most recent successful search
  14.   b, e, i: Integer; // search start, position after the separator, index into sArr
  15.  
  16.   function ExtractedBrackets(const aTxt: String; out Brackets: TStringArray): Boolean;
  17.   // Stores the text inside every top-level "( ... )" pair of aTxt in Brackets; True if any
  18.   // was found. Nested pairs stay inside their outer fragment; a stray ")" is ignored.
  19.   var
  20.     p, pb: Integer;
  21.     bCount: Integer = 0; // current nesting depth
  22.   begin
  23.     SetLength({%H-}Brackets, 0);
  24.     for p := 1 to Length(aTxt) do
  25.       if aTxt[p] = '(' then
  26.         begin
  27.           if bCount = 0 then
  28.             pb := Succ(p); // the fragment starts right after the outermost "("
  29.           Inc(bCount);
  30.         end
  31.       else if (aTxt[p] = ')') and (bCount > 0) then
  32.         begin
  33.           Dec(bCount);
  34.           if bCount > 0 then
  35.             Continue; // still inside an outer pair
  36.           SetLength(Brackets, Length(Brackets)+1);
  37.           Brackets[High(Brackets)] := Copy(aTxt, pb, p-pb);
  38.         end;
  39.     Result := Length(Brackets) > 0;
  40.   end;
  41.  
  42. function FoundBetweenPatternAndSeparator(aBegin: Integer; const aTxt, aPattern: String; aSeparators: TSysCharSet; out Fragment: String; out EndPos: Integer): Boolean;
  43. // Finds aPattern in aTxt at or after aBegin. When a character of aSeparators follows it,
  44. // Fragment is the text in between and EndPos the index after it; else False, '' and 0.
  45. var
  46.   p, b: Integer;
  47. begin
  48.   Fragment := '';
  49.   EndPos := 0;
  50.   Result := False;
  51.   b := Pos(aPattern, aTxt, aBegin);
  52.   if b = 0 then
  53.     Exit;
  54.   Inc(b, Length(aPattern)); // b = first character after the pattern
  55.   p := b;
  56.   // The bound is tested before indexing: the pattern may sit at the very end of aTxt.
  57.   while (p <= Length(aTxt)) and not (aTxt[p] in aSeparators) do
  58.     Inc(p);
  59.   Result := p <= Length(aTxt); // False: ran off the end without meeting a separator
  60.   if not Result then
  61.     Exit;
  62.   Fragment := Copy(aTxt, b, p-b);
  63.   EndPos := Succ(p);
  64. end;
  65.  
  66. begin
  67.   if ExtractedBrackets(txt, sArr) then
  68.     for i := 0 to High(sArr) do
  69.       begin
  70.         b := 1; // every bracket fragment is searched from its own first character
  71.         while FoundBetweenPatternAndSeparator(b, sArr[i], 'nothing=', [','], s, e) do
  72.           begin
  73.             WriteLn(s);
  74.             b := e; // continue after the separator that ended this fragment
  75.           end;
  76.       end;
  77.   ReadLn;
  78. end.