### Bookstore

 Computer Math and Games in Pascal (preview) Lazarus Handbook

### Author Topic: [SOLVED] Regex Problem  (Read 877 times)

#### superc

• Sr. Member
• Posts: 250
##### [SOLVED] Regex Problem
« on: August 01, 2024, 12:50:28 pm »
Hello,
I must parse a file with fixed space like this:

1A   300011    asdaasd                             510,00    515,00    1,0    0    5
2A   310141    asdasdasdasdasda           700,00    701,00    0,1    0    5
3A   310007    sdfgsdfsdfsdfsdfsdfsdf       240,00    241,00    0,4    0    5
44   310004    sdfgsdfsdfsdfsdfsdfsdf       240,00    241,00    0,4    0    5

I try to use TRegExpr but I haven't been able to get a satisfactory result and I'm wondering if this was the correct way: how would you proceed?
« Last Edit: August 06, 2024, 03:19:55 pm by superc »

#### Martin_fr

• Hero Member
• Posts: 10231
• Debugger - SynEdit - and more
##### Re: Regex Problem
« Reply #1 on: August 01, 2024, 01:09:26 pm »
Assuming that there are no spaces in the data ever. I.e. each sequence of space(s) is a separator.

I would probably write my own parser. Just a loop, maybe using PosEx().

Code: Pascal  [Select][+][-]
1. i := 1;
2. DataFoundCnt := 0;
3. while i < length(line) do begin
4.   // inner loop to consume any spaces
5.   j := i; // first non space
6.   // inner loop to find end of non spaces
7.   Data[DataFndCount] := copy(line, i, j-i);
8.   inc(DataFindCount):
9. end;

But regex is fine too, if you prefer.

I don't know what version of TRegExpr is currently in Fpc, if you have issues, try the latest from https://github.com/andgineer/TRegExpr

#### 440bx

• Hero Member
• Posts: 4472
##### Re: Regex Problem
« Reply #2 on: August 01, 2024, 01:10:33 pm »
from the looks of it, you have fields separated by spaces that makes extracting individual fields quite simple.

if you're doing this in Windows (or even Linux), the function strtok (present in ntdll and "somewhere" in Linux) will do all the necessary parsing for you.  All you need is to implement a loop and save the tokens someplace (likely an array of arrays of char)

Google, "strtok".  if you decide to go that route I'll post the ntdll prototype for you and an example too (I think I have one someplace on my hard drive.)

HTH.

ETA:

Pascal example (full program):

Code: Pascal  [Select][+][-]
1. {\$APPTYPE       CONSOLE}
2.
4.
5. {\$LONGSTRINGS   OFF}
6.
7. { --------------------------------------------------------------------------- }
8.
9.
10. { only one of the defines below should be active.  NOTE: these directives     }
11. { should match, or be consistent with, the directives used to compile the     }
12. { ntdll unit.                                                                 }
13.
14. {\$define USE_POINTERS}
15. //{\$define USE_VAR}
16.
17. { --------------------------------------------------------------------------- }
18.
19.
20. program _strtok_s;
21.   { finds the next token in an input string. returns a pointer to the next    }
22.   { token found in the input string. returns NULL when no more tokens are     }
23.   { found. Each call modifies the input string by substituting a null         }
24.   { character for the first delimiter that occurs after the returned token.   }
25.
26.   { NOTE that unlike the C runtime version, the ntdll version is not able to  }
27.   { report an error which can happen when some of the parameters are null.    }
28.
29.   { documented at :                                                           }
30.   {                                                                           }
31.   { https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/strtok-s-strtok-s-l-wcstok-s-wcstok-s-l-mbstok-s-mbstok-s-l?view=msvc-160 }
32.
33.
34. uses
35.   sysutils
36. ;
37.
38.
39. {  strtok_s                                           ----------------------- }
40.
41. {\$ifdef USE_POINTERS}
42.   function strtok_s
43.            (
44.             { _in_out_ } InoutAsciizString  : pchar;
45.             { _in_     } InAsciizDelimiters : pchar;
46.             { _in_out_ } InoutContext       : ppchar
47.            )
48.          : pchar; cdecl; external ntdll;                           { !! CDECL }
49. {\$endif}
50.
51. {\$ifdef USE_VAR}
52.   function strtok_s
53.              (
54.               { _in_out_ }     InoutAsciizString  : pchar;
55.               { _in_     }     InAsciizDelimiters : pchar;
56.               { _in_out_ } var InoutContext       : pchar
57.              )
58.            : pchar; cdecl; external ntdll;                         { !! CDECL }
59. {\$endif}
60.
61. {\$ifdef USE_OUT}                                      { same as USE_VAR       }
62.   function strtok_s
63.              (
64.               { _in_out_ }     InoutAsciizString  : pchar;
65.               { _in_     }     InAsciizDelimiters : pchar;
66.               { _in_out_ } var InoutContext       : pchar
67.              )
68.            : pchar; cdecl; external ntdll;                         { !! CDECL }
69. {\$endif}
70.
71.   { finds the next token in an input string. returns a pointer to the next    }
72.   { token found in the input string. returns NULL when no more tokens are     }
73.   { found. Each call modifies the input string by substituting a null         }
74.   { character for the first delimiter that occurs after the returned token.   }
75.
76.   { NOTE that unlike the C runtime version, the ntdll version is not able to  }
77.   { report an error which can happen when some of the parameters are null.    }
78.
79.   { documented at :                                                           }
80.   {                                                                           }
81.   { https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/strtok-s-strtok-s-l-wcstok-s-wcstok-s-l-mbstok-s-mbstok-s-l?view=msvc-160 }
82.
83.
85.                   BlockSize   : DWORD);
86.   { displays a memory block in hex                                            }
87.
88. const
89.   SPACE          = \$20;                { ASCII space character                }
90.   HEX_DUMP_WIDTH = 16;                 { number of hex values per line        }
91.
92.   HexDigits      : packed array[0..\$F] of char = '0123456789ABCDEF';
93.
94. var
95.   { variables related to the formatting of hex strings to be displayed        }
96.
97.   Buf            : packed array[0..80] of char;  { one output line            }
98.
99.   HexPtr         : ^char; { pointer to hex area of line buffer                }
100.   CharPtr        : ^char; { pointer to character area of line buffer          }
101.   p              : PBYTE; { used to walk the memory block                     }
102.
103.   i              : DWORD;
104.
105. begin
106.   i := 0;
107.
108.   while i < BlockSize do
109.   begin
110.     RtlFillMemory(@Buf, sizeof(Buf), SPACE);   { space out the buffer         }
111.
112.     { place the offset at the beginning of the line                           }
113.
114.     StrFmt(Buf, '  %8.8x : ', [ptruint(pchar(BaseAddress) + i)]);
115.
116.     { calculate the pointer value to the start of the hex area in buf         }
117.
118.     HexPtr := pointer(pchar(@Buf) + strlen(Buf));
119.
120.     { calculate the pointer value to the start of the character area          }
121.
122.     CharPtr := pointer(pchar(HexPtr) + (HEX_DUMP_WIDTH * 3) + 1);
123.
124.     repeat
125.       pchar(p) := pchar(BaseAddress) + i;          { current byte pointer     }
126.
127.       HexPtr^ := HexDigits[p^ shr 4];              { first nibble             }
128.       inc(HexPtr);
129.
130.       { the typecast byte(\$F) is necessary to avoid a bug in FPC v3.0.4 that  }
131.       { causes it to access a word instead of a byte.                         }
132.
133.       HexPtr^ := HexDigits[p^ and byte(\$F)];       { second nibble            }
134.       inc(HexPtr);
135.
136.       { increment HexPtr again to leave a space between bytes                 }
137.
138.       inc(HexPtr);
139.
140.       { if the byte is a printable character then just place it in the char   }
141.       { area of the buffer, otherwise put a dot instead                       }
142.
143.       CharPtr^ := '.';                              { non printable character }
144.       if p^ in [32..126] then CharPtr^ := char(p^); { printable               }
145.
146.       inc(CharPtr);
147.
148.       { put an extra space between the first and second half of the hex area  }
149.
150.       if i mod HEX_DUMP_WIDTH = (HEX_DUMP_WIDTH div 2) - 1 then inc(HexPtr);
151.
152.       inc(i);
153.     until (i >= BlockSize) or ((i mod HEX_DUMP_WIDTH) = 0);
154.
155.     CharPtr^ := #0;                       { null terminate the buffer         }
156.
157.     writeln(Buf);
158.   end;
159. end;
160.
161.
162. const
163.   AsciizText : pchar = 'A tab character'#9'followed by a newline'#13#10' '    +
164.                        ' followed by these characters and, a comma.';
165.
166.   Delimiters : pchar = ' ,'#9#13#10;
167.
168.
169. var
170.   { strtok_s needs a writeable string.  for that reason we copy the AsciiText }
171.   { into the following array which is writeable.                              }
172.
173.   TextToTokenize : packed array[0..1023] of char;
174.
175.   Token          : pchar;
176.   Context        : pchar;
177.
178. begin
179.   writeln;
180.   writeln;
181.
182.   { move the text to tokenize to a writeable memory block                     }
183.
184.   strcpy(TextToTokenize, AsciizText);
185.
186.   writeln('  string to break into tokens');
187.   writeln;
188.   writeln('  ', TextToTokenize);
189.   writeln;
190.
191.
192.   writeln;
193.   writeln('  the set of delimiters to break the string is (space, comma, '    +
194.              'tab, CRLF)');
195.   writeln;
196.   writeln('  ', Delimiters);       { tabs and CR/LF will _not_ be visible     }
197.   writeln;
198.   HexDump(Delimiters, strlen(Delimiters));
199.   writeln;
200.   writeln;
201.
202.
203.   Token   := nil;             { initialize                                    }
204.   Context := @TextToTokenize[low(TextToTokenize)];
205.
206.   writeln('  the tokens in the string are');
207.   writeln;
208.   writeln;
209.
210.   repeat
211.     {\$ifdef USE_VAR}
212.       Token := strtok_s(Context, Delimiters, Context);
213.     {\$endif}
214.
215.     {\$ifdef USE_POINTERS}
216.       Token := strtok_s(Context, Delimiters, @Context);
217.     {\$endif}
218.
219.     if Token = nil then break;
220.
221.     writeln('  ', Token);
222.   until (Context = nil) or (Context^ = #0);
223.
224.
225.
226.
227.   writeln;
228.   writeln;
231. end.
« Last Edit: August 01, 2024, 01:19:22 pm by 440bx »
(FPC v3.0.4 and Lazarus 1.8.2) or (FPC v3.2.2 and Lazarus v3.2) on Windows 7 SP1 64bit.

#### marcov

• Hero Member
• Posts: 11722
• FPC developer.
##### Re: Regex Problem
« Reply #3 on: August 01, 2024, 01:20:45 pm »
So does the split helper, no need to work with foreign language functions.

The real question is if the fields can contain spaces, as Martin_fr says. Fixed usually means that it can, but then you need a file format definition (which columns are field starts), and you can simply loop over the columns with a FOR and do something along the lines as:

Code: Pascal  [Select][+][-]
1.
2. // columns is an array of first column numbers.
3. for i:=0 to Length(Columns)-1 do
4.   begin
5.      if DataFndCount=length(columns)-1 then
6.          endcolumn:=length(s);
7.      else
8.         endcolumn:=Columns[i+1]-1;
9.    Data[DataFndCount] := copy(line, columns[datefieldcount], endcolumn- columns[datefieldcount]+1);
10. end;
11.

The column positions are sometimes determined from a  header line where the fieldnames are replaced by underscores, and can be parsed using space separation.

#### cdbc

• Hero Member
• Posts: 1496
##### Re: Regex Problem
« Reply #4 on: August 01, 2024, 01:36:23 pm »
Hi
A while back, someone asked a similar question...
I wrote a parser for that, back then, it wouldn't take much to massage it to your needs...
I've attached the zip, in the hope it can be of use to you...
Regards Benny
If it ain't broke, don't fix it
PCLinuxOS(rolling release) 64bit -> KDE5 -> FPC 3.2.2 -> Lazarus 2.2.6 up until Jan 2024 from then on it's: KDE5/QT5 -> FPC 3.3.1 -> Lazarus 3.0

#### Dzandaa

• Sr. Member
• Posts: 349
• From C# to Lazarus
##### Re: Regex Problem
« Reply #5 on: August 01, 2024, 02:26:34 pm »
Hi,

You can also try this (if you don't have spaces in the String parameter!!!):

Code: Pascal  [Select][+][-]
1.  BTest: TButton;
2.  OpenDialog: TOpenDialog;
3.
4. procedure TForm1.BTestClick(Sender: TObject);
5. Var
6.  InputFile: text;
7.  Str: String;
8.  StrArr: TStringArray;
9. begin
10.  StrArr := nil;
11.  if(OpenDialog.Execute) then
12.  begin
13.   AssignFile(InputFile, OpenDialog.FileName);
14.   Reset(InputFile);
15.   while not Eof(InputFile) do
16.   begin
18.    StrArr := Str.Split([' '],  TStringSplitOptions.ExcludeEmpty);
19.   end;
20.   CloseFile(InputFile);
21.  end;
22. end;
23.
24.

StrArr is a String array containing the different fields.

B->
« Last Edit: August 01, 2024, 02:32:54 pm by Dzandaa »
Regards,
Dzandaa

#### MarkMLl

• Hero Member
• Posts: 7441
##### Re: Regex Problem
« Reply #6 on: August 01, 2024, 04:16:59 pm »
I'd go "old school". If the positions of the fields really are fixed I'd use Copy() and Trim(); if they aren't and the spaces really are used as delimiters I'd replace the existing delimiters (i.e. individual or runs of spaces) with e.g. \$7f and then split on those.

MarkMLl
MT+86 & Turbo Pascal v1 on CCP/M-86, multitasking with LAN & graphics in 128Kb.
Logitech, TopSpeed & FTL Modula-2 on bare metal (Z80, '286 protected mode).
Pet hate: people who boast about the size and sophistication of their computer.
GitHub repositories: https://github.com/MarkMLl?tab=repositories

#### Ally

• Jr. Member
• Posts: 55
##### Re: Regex Problem
« Reply #7 on: August 01, 2024, 04:19:44 pm »
Hello,

here is another suggestion.

Code: Pascal  [Select][+][-]
1. uses
2.   StrUtils;
3.
4. procedure TForm1.Button1Click(Sender: TObject);
5. var
6.   S: String;
7. begin
8.   S := '2A   310141    asdasdasdasdasda           700,00    701,00    0,1    0    5';
9.
10.   Label1.Caption := Copy2SymbDel(S, ' ');
11.   S := TrimLeft(S);
12.   Label2.Caption := Copy2SymbDel(S, ' ');
13.   S := TrimLeft(S);
14.   Label3.Caption := Copy2SymbDel(S, ' ');
15.   S := TrimLeft(S);
16.   Label4.Caption := Copy2SymbDel(S, ' ');
17.   S := TrimLeft(S);
18.   Label5.Caption := Copy2SymbDel(S, ' ');
19.   S := TrimLeft(S);
20.   Label6.Caption := Copy2SymbDel(S, ' ');
21.   S := TrimLeft(S);
22.   Label7.Caption := Copy2SymbDel(S, ' ');
23.   S := TrimLeft(S);
24.   Label8.Caption := Copy2SymbDel(S, ' ');
25. end;
« Last Edit: August 01, 2024, 04:22:47 pm by Ally »

#### Roland57

• Sr. Member
• Posts: 450
##### Re: Regex Problem
« Reply #8 on: August 01, 2024, 09:05:10 pm »
My proposition.

Code: Pascal  [Select][+][-]
1. uses
2.   SysUtils, RegExpr;
3.
4. const
5.   SAMPLE: array[1..4] of string = (
6.     '1A   300011    asdaasd                    510,00    515,00    1,0    0    5',
7.     '2A   310141    asdasdasdasdasda           700,00    701,00    0,1    0    5',
8.     '3A   310007    sdfgsdfsdfsdfsdfsdfsdf     240,00    241,00    0,4    0    5',
9.     '44   310004    sdfgsdfsdfsdfsdfsdfsdf     240,00    241,00    0,4    0    5'
10.   );
11.
12.   EXPR =
13.     '(\w{2})'       + '\s+' +
14.     '(\d{6})'       + '\s+' +
15.     '([a-z]+)'      + '\s+' +
16.     '(\d{3},\d{2})' + '\s+' +
17.     '(\d{3},\d{2})' + '\s+' +
18.     '(\d,\d)'       + '\s+' +
19.     '(\d)'          + '\s+' +
20.     '(\d)';
21.
22. var
23.   i, j: integer;
24.
25. begin
26.   with TRegExpr.Create(EXPR) do
27.   try
28.     for i := Low(SAMPLE) to High(SAMPLE) do
29.       if Exec(SAMPLE[i]) then
30.       begin
31.         for j := 1 to 8 do
32.           Write('|', Match[j]);
33.         WriteLn('|');
34.       end;
35.   finally
36.     Free;
37.   end;
38. end.
39.

\$ ./ex1
|1A|300011|asdaasd|510,00|515,00|1,0|0|5|
|2A|310141|asdasdasdasdasda|700,00|701,00|0,1|0|5|
|3A|310007|sdfgsdfsdfsdfsdfsdfsdf|240,00|241,00|0,4|0|5|
|44|310004|sdfgsdfsdfsdfsdfsdfsdf|240,00|241,00|0,4|0|5|
\$

Or, using only one string:

Code: Pascal  [Select][+][-]
1. var
2.   j: integer;
3.
4. begin
5.   with TRegExpr.Create(EXPR) do
6.   try
7.     if Exec(SAMPLE) then
8.     repeat
9.       for j := 1 to 8 do
10.         Write('|', Match[j]);
11.       WriteLn('|');
12.     until not ExecNext;
13.   finally
14.     Free;
15.   end;
16. end.

Full example attached.
« Last Edit: August 02, 2024, 08:24:24 am by Roland57 »
My projects are on Gitlab and on Codeberg.

#### paweld

• Hero Member
• Posts: 1180
##### Re: Regex Problem
« Reply #9 on: August 02, 2024, 04:38:47 pm »
or use FixedFormatDataSet:
Code: Pascal  [Select][+][-]
1. unit Unit1;
2.
3. {\$mode objfpc}{\$H+}
4.
5. interface
6.
7. uses
8.   Classes, SysUtils, SdfData, DB, Forms, Controls, Graphics, Dialogs, DBGrids;
9.
10. type
11.
12.   { TForm1 }
13.
14.   TForm1 = class(TForm)
15.     DataSource1: TDataSource;
16.     DBGrid1: TDBGrid;
17.     FixedFormatDataSet2: TFixedFormatDataSet;
18.     procedure FormCreate(Sender: TObject);
19.   private
20.
21.   public
22.
23.   end;
24.
25. var
26.   Form1: TForm1;
27.
28. implementation
29.
30. {\$R *.lfm}
31.
32. { TForm1 }
33.
34. const
35.   data: String =
36.   // ----5--------10---------------------------29--------10--------10------7----5-1
37.     '1A   300011    asdaasd                      510,00    515,00    1,0    0    5' + #13#10 +
38.     '2A   310141    asdasdasdasdasda             700,00    701,00    0,1    0    5' + #13#10 +
39.     '3A   310007    sdfgsdfsdfsdfsdfsdfsdf       240,00    241,00    0,4    0    5' + #13#10 +
40.     '44   310004    sdfgsdfsdfsdfsdfsdfsdf       240,00    241,00    0,4    0    5';
41.
42. var
43.   ffds: TFixedFormatDataSet;
44.
45. procedure TForm1.FormCreate(Sender: TObject);
46. var
47.   ss: TStringStream;
48. begin
49.   ffds := TFixedFormatDataSet.Create(Self);
50.   ffds.CodePage := 'windows-1252';
51.   ffds.TrimSpace := True;
52.   //set schema
62.   ss := TStringStream.Create(data);
64.   ss.Free;
65.   DataSource1.DataSet := ffds;
66.   ffds.Open;
67. end;
68.
69. end.
70.
Best regards / Pozdrawiam
paweld

#### superc

• Sr. Member
• Posts: 250
##### Re: Regex Problem
« Reply #10 on: August 06, 2024, 03:19:39 pm »
Thanks, these are all pieces of code that will be useful to me, these are problems that recur frequently....

Hi,

You can also try this (if you don't have spaces in the String parameter!!!):

Code: Pascal  [Select][+][-]
1.  BTest: TButton;
2.  OpenDialog: TOpenDialog;
3.
4. procedure TForm1.BTestClick(Sender: TObject);
5. Var
6.  InputFile: text;
7.  Str: String;
8.  StrArr: TStringArray;
9. begin
10.  StrArr := nil;
11.  if(OpenDialog.Execute) then
12.  begin
13.   AssignFile(InputFile, OpenDialog.FileName);
14.   Reset(InputFile);
15.   while not Eof(InputFile) do
16.   begin
18.    StrArr := Str.Split([' '],  TStringSplitOptions.ExcludeEmpty);
19.   end;
20.   CloseFile(InputFile);
21.  end;
22. end;
23.
24.

StrArr is a String array containing the different fields.

B->

I preferred this solution because it was an easy solution for my problem, thanks again.