### Bookstore

 Computer Math and Games in Pascal (preview) Lazarus Handbook

### Author Topic: Regex question  (Read 5145 times)

#### ASerge

• Hero Member
• Posts: 1693
##### Re: Regex question
« Reply #30 on: October 12, 2019, 01:06:24 am »
> Thank you, but unfortunately it is not good for me. I need REGEX.

Code: Pascal  [Select][+][-]
1. {\$MODE OBJFPC}
2. {\$APPTYPE CONSOLE}
3. {\$LONGSTRINGS ON}
4.
5. uses RegExpr;
6.
{ Runs the expression '^\W*(\w+)' over S in multi-line mode and writes the
  first capture group of every match on its own line, wrapped in double
  quotes.  The TRegExpr instance is always released, even if matching raises. }
procedure Test(const S: string);
var
  Matcher: TRegExpr;
  Found: Boolean;
begin
  Matcher := TRegExpr.Create('');
  try
    Matcher.ModifierM := True;
    Matcher.Expression := '^\W*(\w+)';
    // Walk all matches: Exec finds the first one, ExecNext each following one.
    Found := Matcher.Exec(S);
    while Found do
    begin
      Writeln('"', Matcher.Match[1], '"');
      Found := Matcher.ExecNext;
    end;
  finally
    Matcher.Free;
  end;
end;
23.
const
  { Sample input: a brace-delimited block with one item per line, some of
    them followed by a comma and/or a '//' comment. }
  CSampleInputText = '{' + LineEnding
    + '   First,//Something text' + LineEnding
    + '   Second,' + LineEnding
    + '   Small//Something text' + LineEnding
    + '   Big' + LineEnding
    + '}';

{ Program entry point: feed the sample text to Test. }
begin
  Test(CSampleInputText);
end.

#### howardpc

• Hero Member
• Posts: 3581
##### Re: Regex question
« Reply #31 on: October 12, 2019, 11:19:09 am »
> I don't know in advance how many strings will be there, the 4 is just an example. So, I need a general solution, regex can do it.

A more general (non-regex) solution might be something like the following.
Code: Pascal  [Select][+][-]
1. program ParseExample;
2.
3. {\$AppType console}
4. {\$Mode objfpc}{\$H+}
5.
6.
7. uses Classes, SysUtils;
8.
const
  { Sample input: a brace-delimited block, one item per line; the 'extra'
    line is padded with spaces on both sides. }
  txt = '{' + LineEnding
      + '   First,//Something text' + LineEnding
      + '   Second,' + LineEnding
      + '                          extra                   ' + LineEnding
      + '   Small//Something text' + LineEnding
      + '   Big' + LineEnding
      + '}';
16.
{ Parses a brace-delimited block of text into the first word of each line.

  aTxt is trimmed and split into lines (via list.Text).  The first and last
  lines must be '{' and '}' (ignoring surrounding blanks) and there must be at
  least one line between them; otherwise the function returns False and list
  holds the unprocessed lines.

  On a well-framed block the brace lines are removed and every remaining line
  in list is replaced by its first alphabetic word.  The result is True only
  if every line yielded a non-empty word; when it is False for that reason,
  list has nevertheless already been rewritten. }
function ParsedCommentToFirstWords(const aTxt: String; list: TStrings): Boolean;

    { Returns the leading run of ASCII letters in s after skipping leading
      spaces and tabs, or '' when the line is empty or starts with something
      else. }
    function FirstWord(const s: String): String;
    var
      p, b: Integer;
    begin
      p := 1;
      // The bound is tested BEFORE indexing, so an empty line (which a blank
      // line inside the block produces) never reads s[1] of an empty string.
      while (p <= Length(s)) and (s[p] in [' ', #9]) do
        Inc(p);
      b := p;
      while (p <= Length(s)) and (s[p] in ['a'..'z','A'..'Z']) do
        Inc(p);
      // p now sits one past the word, so the length is simply p - b.
      Result := Copy(s, b, p - b);
    end;

var
  i, min: Integer;
begin
  list.Text := Trim(aTxt);
  if (list.Count < 3) or (Trim(list[0]) <> '{') or (Trim(list[list.Count-1]) <> '}') then
    Exit(False);
  // Drop the '{' and '}' framing lines.
  list.Delete(0);
  list.Delete(list.Count-1);
  // Track the shortest extracted word; a length of 0 means some line had none.
  min := MaxInt;
  for i := 0 to list.Count-1 do
    begin
      list[i] := FirstWord(list[i]);
      if min > Length(list[i]) then
        min := Length(list[i]);
    end;
  Result := min > 0;
end;
52.
var
  sl: TStringList;
  s: String;

{ Program entry point: extract the first word of every line of txt and print
  them one per line, or report that txt is not a well-formed block. }
begin
  sl := TStringList.Create;
  try
    if ParsedCommentToFirstWords(txt, sl) then
      for s in sl do
        WriteLn(s)
    else
      Writeln('invalid text format for "',txt,'"');
  finally
    // Release the list even if parsing or output raises.
    sl.Free;
  end;
end.

#### justnewbie

• Full Member
• Posts: 228
##### Re: Regex question
« Reply #32 on: October 14, 2019, 10:33:10 pm »
Guys, thank you very much!

#### justnewbie

• Full Member
• Posts: 228
##### Re: Regex question
« Reply #33 on: October 15, 2019, 03:49:45 pm »
Hi ASerge, I wish you were somewhere nearby ...
I have a task that is too complicated for me (again! LOL). I spent hours with it, but ...
I have a text and I need to get all the variable names (highlighted with bold).
What is the proper REGEX pattern to get them? Have you any idea?
Quote
int **something**;
double **anything**=2.15;
int **nothing**  = 7;
string **anytext**, **mytext**="",**poems**;
int   **x**=7;int **z**; int **w**=85;
int **k**,**q**,**e**=2;
« Last Edit: October 15, 2019, 04:32:31 pm by justnewbie »

#### Thaddy

• Hero Member
• Posts: 10573
##### Re: Regex question
« Reply #34 on: October 15, 2019, 04:02:43 pm »
Regular expressions are powerful, but that looks like something where they are not in place.
It is much simpler to compare to a list of reserved words(int,double etc, not something and family) and parse until white space or control is met
« Last Edit: October 15, 2019, 04:06:03 pm by Thaddy »

#### justnewbie

• Full Member
• Posts: 228
##### Re: Regex question
« Reply #35 on: October 15, 2019, 04:10:36 pm »
> Regular expressions are powerful, but that looks like something where they are not in place.
> It is much simpler to compare to a list of reserved words(int,double etc, not something and family) and parse until white space or control is met

I don't understand your post. I need to get the variable names (something, anything etc ..) that are highlighted with bold.
« Last Edit: October 15, 2019, 04:13:51 pm by justnewbie »

#### lucamar

• Hero Member
• Posts: 3204
##### Re: Regex question
« Reply #36 on: October 15, 2019, 04:30:39 pm »
> > Regular expressions are powerful, but that looks like something where they are not in place.
> > It is much simpler to compare to a list of reserved words(int,double etc, not something and family) and parse until white space or control is met
>
> I don't understand your post. I need to get the variable names (something, anything etc ..) that are highlighted with bold.

It means parsing out the known reserved words, numbers, etc. so that you're left with the "unknown" ones, which are what you're after. For example, parsing the second line: double anything=2.15; you first get "double" which, being a reserved word, you can ignore; then you skip the space(s) and get the word up to the symbol "=": you get "anything" which is not a reserved word but comes after one, so it must be a variable name, which is what you're looking for. Keep going on, skipping the parts in which you're not interested, and you get your list of variables as result.

It might help if you draw a BNF diagram of your lines; that allows you to get a "feel" of what (and how) to parse "in" and what "out".
« Last Edit: October 15, 2019, 04:34:09 pm by lucamar »
Turbo Pascal 3 CP/M - Amstrad PCW 8256 (512 KB !!!)
Lazarus/FPC 2.0.8/3.0.4 & 2.0.10/3.2.0 - 32/64 bits on:
(K|L|X)Ubuntu 12..18, Windows XP, 7, 10 and various DOSes.

#### justnewbie

• Full Member
• Posts: 228
##### Re: Regex question
« Reply #37 on: October 15, 2019, 04:40:04 pm »
> > > Regular expressions are powerful, but that looks like something where they are not in place.
> > > It is much simpler to compare to a list of reserved words(int,double etc, not something and family) and parse until white space or control is met
> >
> > I don't understand your post. I need to get the variable names (something, anything etc ..) that are highlighted with bold.
>
> It means parsing out the known reserved words, numbers, etc. so that you're left with the "unknown" ones, which are what you're after. For example, parsing the second line: double anything=2.15; you first get "double" which, being a reserved word, you can ignore; then you skip the space(s) and get the word up to the symbol "=": you get "anything" which is not a reserved word but comes after one, so it must be a variable name, which is what you're looking for. Keep going on, skipping the parts in which you're not interested, and you get your list of variables as result.
>
> It might help if you draw a BNF diagram of your lines; that allows you to get a "feel" of what (and how) to parse "in" and what "out".

To be honest, I cannot imagine that there is no much simpler way.
Think of it: there are hundreds of reserved words and characters (my example is heavily simplified).

This is where I am now: I can get the "orange parts" that contain those names (see picture). The pattern: `\b(int|double|string)(\s+)(.+)(;)` (the forum rendered the closing ";)" of the pattern as a smiley).
But, I don't know how could I get ONLY the names.
« Last Edit: October 15, 2019, 04:54:35 pm by justnewbie »

#### Thaddy

• Hero Member
• Posts: 10573
##### Re: Regex question
« Reply #38 on: October 15, 2019, 04:51:44 pm »
> To be honest, I cannot imagine that there is no a much simpler way.
> Think of it: there are hundreds of reserved words and characters (my example is heavily simplified).

It is NOT that easy! As Lucamar confirmed.
For such tasks I usually write a compiler...
Don't be afraid, I mean I write a grammar and use plex and pyacc or GoldParser to  generate the basic code.
In your case any C grammar would generate a correct lexer and parser for your problem so you don't even have to write a grammar......

Now that is a "nice" answer to anyone that thinks simple things must be easy to program..

There is a good example: the h2pas sourcecode. (I mean that: you have to do all that)
« Last Edit: October 15, 2019, 04:57:45 pm by Thaddy »

#### justnewbie

• Full Member
• Posts: 228
##### Re: Regex question
« Reply #39 on: October 15, 2019, 07:20:18 pm »
It can be done with 2 or 3 steps, see pictures.
« Last Edit: October 15, 2019, 07:26:45 pm by justnewbie »

#### howardpc

• Hero Member
• Posts: 3581
##### Re: Regex question
« Reply #40 on: October 15, 2019, 07:59:04 pm »
A non-regex solution could be done by extending the list of allowed type names in the following example.
I don't know if digits are allowed in your variable names. If so, you'll need to adjust the parsing routine accordingly.
Code: Pascal  [Select][+][-]
1. program project1;
2.
3. {\$mode objfpc}{\$H+}
4. {\$IfDef Windows}{\$AppType console}{\$EndIf}
5.
6. uses
7.   SysUtils, Types;
8.
{ Reports whether aWord is one of the recognised type keywords ('int',
  'string', 'double'), compared case-insensitively.  Extend the case labels
  to recognise further keywords.

  There is deliberately no length pre-check.  An expression such as
  "not Length(aWord) in [3, 6]" parses as "(not Length(aWord)) in [3, 6]"
  because "not" binds tighter than "in", so such a guard never fires; and a
  correct one would have to be kept in sync with the keyword list by hand.
  The case statement alone already rejects every non-keyword. }
function IsReserved(const aWord: String): Boolean;
begin
  case LowerCase(aWord) of
    'int',
    'string',
    'double': Result := True;
  else
    Result := False;
  end;
end;
21.
{ Returns, in order of appearance, every alphabetic word of aTxt for which
  IsReserved is False.

  A "word" is a maximal run of the letters A..Z / a..z; everything else
  (digits, punctuation, white space) is treated as a separator.  Consequently
  names containing digits or underscores are split, and alphabetic text inside
  initialisers or string literals is reported as well. }
function GetVarNames(aTxt: String): TStringDynArray;
var
  p: Integer = 1;      // scan position shared with GetNextWord
  count: Integer = 0;  // number of names stored so far
  s: String;

  { Returns the next alphabetic word at or after p and leaves p just past it;
    returns '' once the end of the text has been reached. }
  function GetNextWord: String;
  var
    start: Integer;
  begin
    // "<=" so that the very last character of the text is examined too.
    while (p <= Length(aTxt)) and not (aTxt[p] in ['A'..'Z','a'..'z']) do
      Inc(p);
    start := p;
    while (p <= Length(aTxt)) and (aTxt[p] in ['A'..'Z','a'..'z']) do
      Inc(p);
    // One Copy instead of appending character by character.
    Result := Copy(aTxt, start, p - start);
  end;

begin
  aTxt := Trim(aTxt);
  // Upper bound on the word count: one-letter words divided by single
  // separators give (Length + 1) div 2 words.
  SetLength(Result, (Length(aTxt) + 1) div 2);
  s := GetNextWord;
  while s <> '' do
    begin
      if not IsReserved(s) then
        begin
          Result[count] := s;
          Inc(count);
        end;
      s := GetNextWord;
    end;
  // Shrink to the number of names actually stored.
  SetLength(Result, count);
end;
56.
var
  { Sample declarations whose variable names are to be extracted. }
  txt: String = 'int something;'+LineEnding+
                'double anything=2.15;' + LineEnding +
                'int nothing  = 7;' + LineEnding +
                'string anytext, mytext="",poems;' + LineEnding +
                'int   x=7;int z; int w=85;' + LineEnding +
                'int k,q,e=2;';
  arr: TStringDynArray;
  s: String;

{ Program entry point: print every variable name found in txt, one per line,
  then wait for Enter so a console window stays open. }
begin
  arr := GetVarNames(txt);
  for s in arr do
    WriteLn(s);
  WriteLn('Press [Enter] to finish');
  // The prompt above promises a pause; without this read the program would
  // exit immediately.
  ReadLn;
end.
This outputs
Code: Pascal  [Select][+][-]
1. something
2. anything
3. nothing
4. anytext
5. mytext
6. poems
7. x
8. z
9. w
10. k
11. q
12. e
13. Press [Enter] to finish

#### Thaddy

• Hero Member
• Posts: 10573
##### Re: Regex question
« Reply #41 on: October 15, 2019, 08:17:50 pm »
That's extending white space... (but it is neat code and may be a solution)
The only proper solution is a parser.
« Last Edit: October 15, 2019, 08:20:01 pm by Thaddy »

#### justnewbie

• Full Member
• Posts: 228
##### Re: Regex question
« Reply #42 on: October 15, 2019, 08:36:19 pm »
Thank you guys for the contribution!

#### bytebites

• Sr. Member
• Posts: 345
##### Re: Regex question
« Reply #43 on: October 15, 2019, 09:36:31 pm »
Code: Pascal  [Select][+][-]
1. Result := Result + aTxt[p];
This is slow.

#### howardpc

• Hero Member
• Posts: 3581
##### Re: Regex question
« Reply #44 on: October 22, 2019, 04:32:28 pm »
A non-regex solution:
Code: Pascal  [Select][+][-]
1. program project1;
2.
3. {\$mode objfpc}{\$H+}
4. {\$IfDef Windows}{\$AppType console}{\$EndIf}
5.
6. uses
7.   SysUtils;
8.
9. var
10.   txt: String = 'Something (anything nothing="hey!", anything something, nothing="hola!", thing)' +
11.                 'something nothing="aloha!"';
12.   sArr: TStringArray;
13.   s: String;
14.   b, e, i: Integer;
15.
{ Collects the contents of every top-level parenthesised group in aTxt.

  Brackets receives, in order, the text between each outermost '(' and its
  matching ')', without the parentheses themselves.  Nested parentheses are
  kept verbatim inside the enclosing group.  A ')' with no open '(' is
  ignored, and a group that is never closed contributes nothing.

  Returns True when at least one group was extracted. }
function ExtractedBrackets(const aTxt: String; out Brackets: TStringArray): Boolean;
var
  p: Integer;
  pb: Integer = 0;     // index of the first character after the outermost '('
  depth: Integer = 0;  // current parenthesis nesting depth
begin
  SetLength({%H-}Brackets, 0);
  for p := 1 to Length(aTxt) do
    case aTxt[p] of
      '(':
        begin
          // Only the outermost '(' starts a new group.
          if depth = 0 then
            pb := Succ(p);
          Inc(depth);
        end;
      ')':
        if depth > 0 then
          begin
            Dec(depth);
            // The group ends when the outermost '(' is closed.
            if depth = 0 then
              begin
                SetLength(Brackets, Length(Brackets)+1);
                Brackets[High(Brackets)] := Copy(aTxt, pb, p-pb);
              end;
          end;
    end;
  Result := Length(Brackets) > 0;
end;
41.
{ Finds the text that lies between the next occurrence of aPattern and the
  separator that follows it.

  aBegin      1-based position in aTxt at which the search for aPattern starts
  aTxt        text to search
  aPattern    literal text that must precede the fragment
  aSeparators characters, any one of which terminates the fragment
  Fragment    receives the text between pattern and separator (may be '');
              '' when the function returns False
  EndPos      receives the position just after the separator, suitable as the
              next aBegin; 0 when the function returns False

  Returns True only when aPattern is found AND a separator follows it; a
  pattern whose value runs to the end of aTxt is not reported. }
function FoundBetweenPatternAndSeparator(aBegin: Integer; const aTxt, aPattern: String; aSeparators: TSysCharSet; out Fragment: String; out EndPos: Integer): Boolean;
var
  p, b: Integer;
begin
  Fragment := '';
  Result := False;
  EndPos := 0;
  p := Pos(aPattern, aTxt, aBegin);
  if p = 0 then
    Exit;
  b := p + Length(aPattern);
  // Scan from the first character after the pattern (so an empty value is
  // recognised) and never index past the end of the text.
  p := b;
  while (p <= Length(aTxt)) and not (aTxt[p] in aSeparators) do
    Inc(p);
  if p <= Length(aTxt) then
    begin
      Fragment := Copy(aTxt, b, p-b);
      EndPos := Succ(p);
      Result := True;
    end;
end;
65.
66. begin
67.   b := 1;
68.   if ExtractedBrackets(txt, sArr) then
69.     for i := 0 to High(sArr) do
70.       while FoundBetweenPatternAndSeparator(b, sArr[i], 'nothing=', [','], s, e) do
71.         begin
72.           WriteLn(s);
73.           b := e;
74.         end;