Recent

Author Topic: my Helper for UCS4String (UTF-32)  (Read 862 times)

Jorg3000

  • Jr. Member
  • **
  • Posts: 81
my Helper for UCS4String (UTF-32)
« on: December 26, 2023, 04:48:33 pm »
Hi!
In recent years, I have repeatedly noticed in the forum that beginners and old hands sometimes stumble over UTF-8, especially because a char in a string no longer necessarily represents a character, as was previously the case with code pages.

For example, the following does not work with UTF-8 because the umlauts are encoded in 2 bytes (the same applies to Asian characters and special Unicode characters)
Code: Pascal  [Select][+][-]
  1. s[i]:='ö';
  2. if s[i]='ä' then ...

There are special UTF8 routines, but they have to decode the string each time they are called (at least up to the relevant position) and are therefore rather lame.
For direct access to each character, it would be best if you did not have to work in a compressed/encoded string. This problem is supposed to be solved by a UCS4String, but UCS4String is different from a Pascal String and is poorly supported. Most of the usual routines known for AnsiStrings do not work with UCS4String.
That's why I programmed the helper, with which, for example, the following is possible:
Code: Pascal  [Select][+][-]
  // Usage example: build a UCS4String, replace characters by index, convert back to UTF-8.
  var s: UCS4String;
      i: SizeInt;
  begin
    s.SetStr('Motor');
    s.Add('heäd');
    // Chars[] is a read/write property; writing needs ":=" ("=" would be a comparison).
    for i:=1 to s.Length do
      if s.Chars[i]='ä' then s.Chars[i]:='a';
    s.Chars[4]:='ö';
    ShowMessage( s.toUtf8 ); // => Motörhead
  end;

I have also built methods such as Pos(), Copy(), Insert(), Delete().
The unit is brand new (see the source code below or the attachment) and has only been briefly tested. Maybe someone will find it useful at some point.
Regards, Jörg

Code: Pascal  [Select][+][-]
  1. unit UCS4Helper;
  2.  
  3. {$mode objfpc}{$H+}{$modeswitch typehelpers}
  4.  
  5.  
  6. //  Unit UCS4Helper by Jorg3000, Version 1.0 from 2023-12-24
  7.  
  8.  
  9. interface
  10.  
  11.  
  12. type
  { TUCS4Helper
    String-style convenience methods for UCS4String (UTF-32).
    Every position argument (i, Index, Offset) is 1-based; the terminating
    zero element of the underlying array is never counted as a character. }
  TUCS4Helper = type helper for UCS4String

    // --- length ---------------------------------------------------------
    function  GetLength: SizeInt;
    procedure SetLength(NewLen: SizeInt);

    // --- assignment from other string types -----------------------------
    procedure SetStr(const s: String);  overload;
    procedure SetStr(const ws: WideString);  inline;
    procedure SetStr(const ws: UnicodeString);  inline;

    // --- appending ------------------------------------------------------
    procedure Add(const s: String);  overload;
    procedure Add(const us: UCS4String);

    // --- single-character access (accessors of the Chars property) ------
    function  CharAsUtf8(i: SizeInt): String;
    procedure SetChar(i: SizeInt; const s: String);

    // --- conversion -----------------------------------------------------
    function  toUtf8: String;
    function  toUtf16: UnicodeString;  inline;

    // --- substring operations -------------------------------------------
    function  Copy(Index, Len: SizeInt): UCS4String;
    function  CopyAsUtf8(Index, Len: SizeInt): String;

    procedure Delete(Index, Len: SizeInt);

    procedure Insert(const Source: String; Index: SizeInt);  overload;
    procedure Insert(const Source: UCS4String; Index: SizeInt);

    // --- searching ------------------------------------------------------
    function  Pos(const s: String; Offset: SizeInt=1): SizeInt;  overload;
    function  Pos(const us: UCS4String; Offset: SizeInt=1): SizeInt;

    // --- properties -----------------------------------------------------
    property  Length: SizeInt  read GetLength  write SetLength;

    // "default" does not work here because UCS4String is an array
    property  Chars[i: SizeInt]: String  read CharAsUtf8  write SetChar;
  end;
  48.  
  49.  
  50. implementation
  51.  
  52.  
  53. { TUCS4Helper }
  54.  
  55.  
  // Returns the number of characters, i.e. the array length minus the
  // terminating zero element; an empty array yields 0.
  function TUCS4Helper.GetLength: SizeInt;
  begin
    Result := System.Length(self) - 1;
    if Result < 0 then
      Result := 0;  // empty array: there is no terminator to subtract
  end;
  61.  
  62.  
  // Resizes the string to NewLen characters and re-writes the terminating zero.
  // NewLen <= 0 empties the string (nil); an unchanged length is a no-op.
  procedure TUCS4Helper.SetLength(NewLen: SizeInt);
  begin
    if NewLen <= 0 then
      self := nil
    else if self.GetLength <> NewLen then
    begin
      // one extra element for the terminating zero
      System.SetLength(self, NewLen + 1);
      self[NewLen] := 0;
    end;
  end;
  74.  
  75.  
  // Replaces the content with the given String; an empty string results in nil.
  procedure TUCS4Helper.SetStr(const s: String);
  begin
    if s = '' then
    begin
      self := nil;
      Exit;
    end;
    self := WideStringToUCS4String(WideString(s));
  end;
  81.  
  82.  
  // Replaces the content with the UCS-4 conversion of a WideString.
  procedure TUCS4Helper.SetStr(const ws: WideString);
  var
    Converted: UCS4String;
  begin
    Converted := WideStringToUCS4String(ws);
    self := Converted;
  end;
  87.  
  88.  
  // Replaces the content with the UCS-4 conversion of a UnicodeString.
  procedure TUCS4Helper.SetStr(const ws: UnicodeString);
  var
    Converted: UCS4String;
  begin
    Converted := UnicodeStringToUCS4String(ws);
    self := Converted;
  end;
  93.  
  94.  
  // Appends a String by converting it to UCS-4 and delegating to Add(UCS4String).
  procedure TUCS4Helper.Add(const s: String);
  var
    Tail: UCS4String;
  begin
    Tail := WideStringToUCS4String(WideString(s));
    self.Add(Tail);
  end;
  99.  
  100.  
  // Appends the characters of us (including its terminating zero) to this string.
  // An empty us leaves the string unchanged.
  procedure TUCS4Helper.Add(const us: UCS4String);
  var
    Src: UCS4String;
    OwnLen, SrcLen: SizeInt;
  begin
    // Keep our own reference to the source array. If the caller passes the
    // string itself (s.Add(s)), resizing self below must not invalidate the
    // data we are about to copy from.
    Src := us;

    SrcLen := System.Length(Src) - 1;
    if SrcLen <= 0 then Exit;

    OwnLen := System.Length(self) - 1;
    if OwnLen <= 0 then
    begin
      // Dynamic arrays are reference types: a plain "self := us" would make
      // both variables share one array, so a later Chars[] write to one would
      // silently change the other. Take a real copy instead.
      self := System.Copy(Src, 0, SrcLen + 1);
      Exit;
    end;

    System.SetLength(self, OwnLen + SrcLen + 1);
    // Overwrite our old terminator with the source characters plus its terminator.
    Move( Src[0], self[OwnLen], (SrcLen + 1) * SizeOf(UCS4Char) );
  end;
  113.  
  114.  
  // Splits one UCS-4 code point into UTF-16 code units
  // (similar to UCS4Decode() in ustrings.inc).
  //   c4 <= $FFFF   : w1 = the code unit, w2 = #0
  //   c4 >  $10FFFF : invalid code point, w1 = '?', w2 = #0
  //   otherwise     : w1/w2 = high/low surrogate
  procedure decodeUCS4Char(c4: UCS4Char; out w1, w2: WideChar);
  begin
    if c4 <= $ffff then
    begin
      w1 := WideChar(Lo(c4));
      w2 := #0;
    end
    else if DWord(c4) > $10ffff then
    begin
      w1 := '?';
      w2 := #0;
    end
    else
    begin
      // $d7c0 = $d800 - ($10000 shr 10): folds the "subtract $10000" step into the offset
      w1 := WideChar((c4 shr 10) + $d7c0);
      w2 := WideChar((c4 and $3ff) + $dc00);
    end;
  end;
  123.  
  124.  
  // Returns the character at 1-based position i as a UTF-8 encoded String.
  // No bounds check is performed on i.
  function TUCS4Helper.CharAsUtf8(i: SizeInt): String;
  var
    CodePoint: UCS4Char;
    Unit1, Unit2: WideChar;
    Utf16: WideString;
  begin
    CodePoint := self[i-1];

    if CodePoint >= 128 then
    begin
      decodeUCS4Char(CodePoint, {out} Unit1, Unit2);
      // one code unit for the BMP, two (surrogate pair) beyond it
      if Unit2 = #0 then
        System.SetLength(Utf16, 1)
      else
      begin
        System.SetLength(Utf16, 2);
        Utf16[2] := Unit2;
      end;
      Utf16[1] := Unit1;
      Result := UTF8Encode(Utf16);
    end
    else
    begin
      // ASCII shortcut: a single byte, no conversion needed
      System.SetLength(Result, 1);
      Result[1] := Char(CodePoint);
    end;
  end;
  138.  
  139.  
  // Overwrites the character at 1-based position i with the first character of s.
  // An empty s leaves the string unchanged. No bounds check is performed on i.
  procedure TUCS4Helper.SetChar(i: SizeInt; const s: String);
  var
    Decoded: UCS4String;
  begin
    Decoded := WideStringToUCS4String(WideString(s));
    // length 1 means "terminator only", i.e. no character to store
    if System.Length(Decoded) > 1 then
      self[i-1] := Decoded[0];
  end;
  149.  
  150.  
  // Converts the whole string to UTF-8 (via an intermediate WideString).
  function TUCS4Helper.toUtf8: String;
  var
    Utf16: WideString;
  begin
    Utf16 := UCS4StringToWideString(self);
    Result := UTF8Encode(Utf16);
  end;
  155.  
  156.  
  // Converts the whole string to a UTF-16 UnicodeString.
  function TUCS4Helper.toUtf16: UnicodeString;   inline;
  begin
    Exit( UCS4StringToUnicodeString(self) );
  end;
  161.  
  162.  
  // Returns up to Len characters starting at 1-based position Index as a new,
  // zero-terminated UCS4String. Returns an empty (nil) string when Index or Len
  // is not positive or Index lies beyond the end.
  function TUCS4Helper.Copy(Index, Len: SizeInt): UCS4String;
  var
    Avail: SizeInt;
  begin
    // A managed function result is not guaranteed to be empty on entry, so it
    // must be cleared before any early Exit; otherwise stale content may be returned.
    Result := nil;
    if (Index<=0) or (Len<=0) then Exit;

    Avail := self.GetLength + 1 - Index;  // maximum possible length from Index on
    if Avail<=0 then Exit;
    if Len<Avail then Avail := Len;

    System.SetLength(Result, Avail+1);
    Move( self[Index-1], Result[0], Avail*SizeOf(UCS4Char) );
    Result[Avail] := 0;  // terminating zero
  end;
  177.  
  178.  
  // Same as Copy(), but returns the substring encoded as UTF-8.
  function TUCS4Helper.CopyAsUtf8(Index, Len: SizeInt): String;
  var
    Part: UCS4String;
    Utf16: WideString;
  begin
    Part := self.Copy(Index, Len);
    Utf16 := UCS4StringToWideString(Part);
    Result := UTF8Encode(Utf16);
  end;
  183.  
  184.  
  // Removes up to Len characters starting at 1-based position Index.
  // Does nothing when Index or Len is not positive, or Index is past the last character.
  procedure TUCS4Helper.Delete(Index, Len: SizeInt);
  var
    Total, Count: SizeInt;
  begin
    if (Index <= 0) or (Len <= 0) then Exit;

    Total := self.GetLength;
    if (Total <= 0) or (Index > Total) then Exit;

    // Index <= Total here, so at least one character can be removed
    Count := Total + 1 - Index;  // maximum possible length from Index on
    if Len < Count then
      Count := Len;

    // dynamic-array Delete is 0-based; the terminating zero stays in place
    System.Delete(self, Index - 1, Count);
  end;
  201.  
  202.  
  // Inserts a String at 1-based position Index by converting it to UCS-4
  // and delegating to Insert(UCS4String, Index).
  procedure TUCS4Helper.Insert(const Source: String; Index: SizeInt);
  var
    Converted: UCS4String;
  begin
    Converted := WideStringToUCS4String(WideString(Source));
    self.Insert(Converted, Index);
  end;
  207.  
  208.  
  // Inserts Source in front of the character at 1-based position Index.
  // Index <= 0 or an empty Source is ignored. An Index beyond the end appends
  // (same convention as System.Insert for ordinary strings).
  procedure TUCS4Helper.Insert(const Source: UCS4String; Index: SizeInt);
  var
    Src: UCS4String;
    OwnLen, SrcLen: SizeInt;
  begin
    if Index<=0 then Exit;

    // Own reference, so that s.Insert(s, n) keeps the source alive while self is modified.
    Src := Source;
    SrcLen := Src.GetLength;
    if SrcLen<=0 then Exit;

    OwnLen := self.GetLength;
    if OwnLen=0 then
    begin
      // Nothing to merge with. Going through System.Insert/System.Delete here
      // would strip the only terminating zero when self is nil, so just take
      // a private copy of Source including its terminator.
      self := System.Copy(Src, 0, SrcLen+1);
      Exit;
    end;

    // Never insert behind our own terminating zero.
    if Index>OwnLen+1 then Index := OwnLen+1;

    System.Insert(Src, {var}self, Index-1);
    System.Delete({var}self, Index-1+SrcLen, 1);  // remove the terminating zero of Source
  end;
  218.  
  219.  
  // Searches for a String by converting it to UCS-4 and delegating to
  // Pos(UCS4String, Offset). Returns the 1-based position or 0.
  function TUCS4Helper.Pos(const s: String; Offset: SizeInt=1): SizeInt;
  var
    Needle: UCS4String;
  begin
    Needle := WideStringToUCS4String(WideString(s));
    Result := self.Pos(Needle, Offset);
  end;
  224.  
  225.  
  // Returns the 1-based position of the first occurrence of us at or after
  // Offset, or 0 if there is none. An Offset below 1 is treated as 1; an empty
  // search string or an empty subject always yields 0.
  function TUCS4Helper.Pos(const us: UCS4String; Offset: SizeInt=1): SizeInt;
  var
    Cur, Last, HayLen, NeedleLen, NeedleBytes: SizeInt;
  begin
    Result := 0;

    HayLen := System.Length(self) - 1;
    NeedleLen := System.Length(us) - 1;
    if (HayLen <= 0) or (NeedleLen <= 0) then Exit;

    // convert the 1-based Offset into a 0-based start index
    if Offset < 1 then
      Cur := 0
    else
      Cur := Offset - 1;

    Last := HayLen - NeedleLen;  // last start index where us still fits
    if Cur > Last then Exit;

    NeedleBytes := NeedleLen * SizeOf(UCS4Char);
    while Cur <= Last do
    begin
      if CompareByte(us[0], self[Cur], NeedleBytes) = 0 then
      begin
        Result := Cur + 1;
        Break;
      end;
      Inc(Cur);
    end;
  end;
  246.  
  247. end.
« Last Edit: December 27, 2023, 06:11:36 am by Jorg3000 »

AlexTP

  • Hero Member
  • *****
  • Posts: 2673
    • UVviewsoft
Re: my Helper for UCS4String (UTF-32)
« Reply #1 on: December 26, 2023, 05:03:09 pm »
Good work!
Please use the CODE forum tag instead of the QUOTE forum tag.

 

TinyPortal © 2005-2018