Recent

Author Topic: WideChar / UnicodeString type helper  (Read 2183 times)

codewar65

  • New Member
  • *
  • Posts: 13
WideChar / UnicodeString type helper
« on: August 29, 2017, 06:16:56 am »
I had a specific need for this and thought I'd share. I've been working with various encoding types in the last few projects and needed to be able to convert bytes to UTF8 / UTF16 and to handle 8-bit codepage encoding/decoding. With all the existing units that handle this type of stuff, I was getting confused about which string type to use and which package / unit had the functions I needed.

I can now use UnicodeString / WideChar everywhere internally and convert to/from everything else on an as-needed basis using these helpers.

Please let me know if you notice any glaring mistakes.
Thanks / enjoy.

*modified for minor bug catches.

Code: Pascal  [Select][+][-]
  1. unit UnicodeHelper;
  2.  
  3. {$mode objfpc}{$H+}
  4. {$modeswitch typehelpers}
  5.  
  6. interface
  7.  
  8. uses
  9.   SysUtils;
  10.  
  11. type
  12.  
  13.   TUnicodeStringArray = array of UnicodeString;
  14.  
  15.   TWords = array of Word;
  16.  
  17.   TWideChars = array of WideChar;
  18.  
  19.   TWideCharHelper = type helper for WideChar   // per-character encoding helpers
  20.     public
  21.       function getUTF8Length : integer;   // UTF8 bytes for this char: from 1 to 3 (4=not yet).
  22.       function getUTF16Length : integer;  // always 2
  23.       function getCPLength : integer;     // always 1
  24.       function toCharCode : integer;      // ordinal value of this char
  25.       function fromCharCode(chr : integer) : WideChar;   // WideChar(chr); the value of self is not used
  26.   end;
  27.  
  28.   TUnicodeStringHelper = type helper for UnicodeString   // all index parameters are zero-based
  29.     public
  30.       function length : integer; overload;                            // length in WideChars
  31.       function substring(index : Integer): unicodestring; overload;   // tail starting at index
  32.       function substring(index : Integer; len : Integer): unicodestring; overload;   // up to len chars from index
  33.       function charCodeAt(index : integer) : integer;                 // 0 when index is out of range
  34.       function charAt(index : integer) : WideChar;                    // WideChar(0) when index is out of range
  35.       function split(const Separators: array of WideChar): TUnicodeStringArray; overload;   // cut at any separator
  36.       // other String Helper type functions can be added as required.
  37.  
  38.       function toWideCharArray : TWideChars;
  39.       function toWordArray : TWords;
  40.  
  41.       function toUTF8Bytes : TBytes;
  42.       function toUTF16Bytes : TBytes;
  43.       function toCPBytes : TBytes;                            // chars above 255 become 0
  44.  
  45.       function getUTF8BytesLength : integer;                  // varies
  46.       function getUTF16BytesLength : integer;                 // length * 2
  47.       function getCPBytesLength : integer;                    // length
  48.       // the from* functions below build their result from the bytes argument only.
  49.       function fromUTF8Bytes(bytes : TBytes) : UnicodeString;
  50.       function fromUTF16Bytes(bytes : TBytes) : UnicodeString;   // little endian input
  51.       function fromCPBytes(bytes : TBytes) : UnicodeString;
  52.  
  53.       function hasUTF8BrokenBytes(bytes : TBytes) : boolean;
  54.       function hasUTF16BrokenBytes(bytes : TBytes) : boolean; // true when the byte count is odd
  55.       function hasCPBrokenBytes(bytes : TBytes) : boolean;    // always false
  56.  
  57.       function getUTF8BrokenBytes(bytes : TBytes) : TBytes;
  58.       function getUTF16BrokenBytes(bytes : TBytes) : TBytes;
  59.       function getCPBrokenBytes(bytes : TBytes) : TBytes;     // always returns []
  60.  
  61.       procedure mapCP(map : TWideChars);                      // map must hold exactly 256 entries
  62.   end;
  63.  
  64.  
  65. implementation
  66.  
  67. { TWideCHarHelper }
  68.  
  69. function TWideCharHelper.getUTF8Length : integer;
  70. begin // number of UTF8 bytes used to encode this single WideChar value
  71.   result := 4;                                  // stays 4 only for codes >= $10000
  72.   if      ord(self) < $80    then result := 1   // one byte below $80
  73.   else if ord(self) < $800   then result := 2
  74.   else if ord(self) < $10000 then result := 3;
  75. end;
  76.  
  77. function TWideCharHelper.getUTF16Length : integer; inline;
  78. begin // constant 2 for every character; the value of self is not consulted
  79.   exit(2);
  80. end;
  81.  
  82. function TWideCharHelper.getCPLength : integer; inline;
  83. begin // constant 1 for every character; the value of self is not consulted
  84.   exit(1);
  85. end;
  86.  
  87. function TWideCharHelper.toCharCode : integer; inline;
  88. begin // numeric code of this character
  89.   exit(ord(self));
  90. end;
  91.  
  92. function TWideCharHelper.fromCharCode(chr : integer) : WideChar; inline;
  93. begin // casts the numeric code chr to a WideChar; self is neither read nor changed
  94.   exit(WideChar(chr));
  95. end;
  96.  
  97. { TUnicodeStringHelper }
  98.  
  99. {
  100.   length : number of WideChars in the string, as reported by system.length.
  101. }
  102. function TUnicodeStringHelper.length : integer; inline;
  103. begin
  104.   exit(system.length(self));
  105. end;
  106.  
  107. function TUnicodeStringHelper.substring(index : Integer): unicodestring;
  108. // Returns the tail of the string that starts at the zero-based position
  109. // index. An index outside 0 .. length - 1 gives the empty string.
  110. var
  111.   total : integer;
  112. begin
  113.   total := system.length(self);
  114.   if (index >= 0) and (index < total) then
  115.     // copy() takes a 1-based start, hence index + 1; the count covers
  116.     // every character from there up to the end of the string
  117.     result := copy(self, index + 1, total - index)
  118.   else
  119.     result := '';
  120. end;
  121.  
  122. function TUnicodeStringHelper.substring(index : Integer; len : Integer): unicodestring;
  123. // Returns up to len characters starting at the zero-based position index.
  124. // An index outside 0 .. length - 1, or len <= 0, gives the empty string.
  125. var
  126.   avail : integer;
  127. begin
  128.   if (index < 0) or (index >= self.length) or (len <= 0) then
  129.     exit('');
  130.   // clamp len to the characters left after index. comparing against avail
  131.   // instead of computing index + len keeps a huge len (e.g. MaxInt) from overflowing.
  132.   avail := self.length - index;
  133.   if len > avail then
  134.     len := avail;
  135.   result := copy(self, index + 1, len);   // copy() takes a 1-based start
  136. end;
  137.  
  138. function TUnicodeStringHelper.charCodeAt(index : integer) : integer; inline;
  139. begin // zero-based lookup; 0 is returned for an index outside the string
  140.   if (index >= 0) and (index < system.length(self)) then
  141.     result := ord(self[index + 1])
  142.   else
  143.     result := 0;
  144. end;
  145.  
  146. function TUnicodeStringHelper.charAt(index : integer) : WideChar; inline;
  147. begin // zero-based lookup; WideChar(0) is returned for an index outside the string
  148.   if (index >= 0) and (index < system.length(self)) then
  149.     result := self[index + 1]
  150.   else
  151.     result := WideChar(0);
  152. end;
  153.  
  154. Function TUnicodeStringHelper.split(const Separators: array of WideChar): TUnicodeStringArray;
  155. // Cuts the string at every character that occurs in Separators and returns
  156. // the pieces in order. Empty pieces are kept, so the result always holds
  157. // (number of separator hits + 1) entries; an empty string gives one '' entry.
  158. var
  159.   i, j, lastpos : integer;
  160.   ch : widechar;
  161. begin
  162.   setlength(result, 0);
  163.   lastpos := 0;                         // zero-based start of the current piece
  164.   for i := 0 to self.length - 1 do
  165.   begin
  166.     ch := self.charAt(i);
  167.     for j := 0 to high(Separators) do
  168.     begin
  169.       if ch = Separators[j] then
  170.       begin
  171.         setlength(result, system.length(result) + 1);
  172.         result[system.length(result) - 1] := self.substring(lastpos, i - lastpos);
  173.         lastpos := i + 1;               // next piece starts after the separator
  174.         break;                          // one hit per character is enough
  175.       end;
  176.     end;
  177.   end;
  178.   setlength(result, system.length(result) + 1);
  179.   result[system.length(result) - 1] := self.substring(lastpos);   // trailing piece, possibly ''
  180. end;
  181.  
  182. function TUnicodeStringHelper.toWideCharArray : TWideChars;
  183. begin // copies every character into a new dynamic array; '' gives an empty array
  184.   setlength(result, system.length(self));
  185.   if self <> '' then move(self[1], result[0], system.length(self) * sizeof(WideChar));   // byte count must cover all characters
  186. end;
  187.  
  188. function TUnicodeStringHelper.toWordArray : TWords;
  189. var
  190.   len : longint;
  191. begin // returns the 16 bit values of the characters as an array of Word; '' gives []
  192.   len := self.length;
  193.   setlength(result, len);
  194.   if len > 0 then move(self[1], result[0], sizeof(WideChar) * len);   // no indexing of self[1] / result[0] when empty
  195. end;
  196.  
  197. {
  198.   toUTF8Bytes : returns array of bytes encoded in UTF8, one WideChar at a time.
  199. }
  200. function TUnicodeStringHelper.toUTF8Bytes : TBytes;
  201. var
  202.   cv, i, len, cl :  integer;
  203.   p :               pbyte;
  204.   cw :              WideChar;
  205. begin
  206.   len := self.getUTF8BytesLength;
  207.   setlength(Result, len);
  208.   p := pbyte(Result);   // write cursor; nil for an empty result, so Result[0] is never indexed
  209.   for i := 1 to self.length do
  210.   begin
  211.     cw := self[i];
  212.     cv := cw.toCharCode;
  213.     cl := cw.getUTF8Length;
  214.     case cl of
  215.       1:
  216.         begin
  217.           p^ := cv;
  218.         end;
  219.  
  220.       2:
  221.         begin
  222.           p^ := %11000000 or ((cv >>  6) and %00011111);   // lead byte: top 5 bits
  223.           p += 1;
  224.           p^ := %10000000 or (cv         and %00111111);   // continuation: low 6 bits
  225.         end;
  226.  
  227.       3:
  228.         begin
  229.           p^ := %11100000 or ((cv >> 12) and %00001111);   // lead byte: top 4 bits
  230.           p += 1;
  231.           p^ := %10000000 or ((cv >>  6) and %00111111);
  232.           p += 1;
  233.           p^ := %10000000 or (cv         and %00111111);
  234.         end;
  235.  
  236.       4:  raise exception.create('Characters $10000+ unsupported');
  237.     end;
  238.     p += 1;   // step past the last byte written for this character
  239.   end;
  240. end;
  241.  
  242. {
  243.   toUTF16Bytes : returns the string's WideChars as raw bytes (2 per character), copied as stored in memory.
  244. }
  245. function TUnicodeStringHelper.toUTF16Bytes : TBytes;
  246. var
  247.   len : integer;
  248. begin
  249.   len := self.getUTF16BytesLength;
  250.   setlength(Result, len);
  251.   if len > 0 then move(self[1], Result[0], len);   // no indexing of self[1] / Result[0] when empty
  252. end;
  253.  
  254. {
  255.   toCPBytes : returns array of bytes, one per character. if character is
  256.   beyond the 255 range, it is converted to NULL.
  257. }
  258. function TUnicodeStringHelper.toCPBytes : TBytes;
  259. var
  260.   len, i, cv : integer;
  261. begin
  262.   len := self.getCPBytesLength;   // one byte per WideChar; the UTF8 byte count would overrun self
  263.   setlength(Result, len);
  264.   for i := 1 to len do
  265.   begin
  266.     cv := self[i].toCharCode;
  267.     if cv > 255 then
  268.       cv := 0;                    // not representable in a single byte
  269.     Result[i - 1] := cv;
  270.   end;
  271. end;
  272.  
  273. {
  274.   getUTF8BytesLength : total UTF8 size in bytes, summed over every WideChar.
  275. }
  276. function TUnicodeStringHelper.getUTF8BytesLength : integer;
  277. var
  278.   wc : WideChar;
  279. begin
  280.   result := 0;
  281.   for wc in self do
  282.     inc(result, wc.getUTF8Length);
  283. end;
  284.  
  285. {
  286.   getUTF16BytesLength : byte size of the UTF16 form, two bytes per WideChar.
  287. }
  288. function TUnicodeStringHelper.getUTF16BytesLength : integer; inline;
  289. begin
  290.   exit(system.length(self) * 2);
  291. end;
  292.  
  293. {
  294.   getCPBytesLength : byte size of the codepage form, one byte per WideChar.
  295.   Characters above charcode 255 are counted as one byte as well.
  296. }
  297. function TUnicodeStringHelper.getCPBytesLength : integer; inline;
  298. begin
  299.   exit(system.length(self));
  300. end;
  301.  
  302. {
  303.   fromUTF8Bytes : returns unicodestring of UTF8 in bytes. ignores broken bytes
  304.   of partial codepoints on end. use hasUTF8BrokenBytes / getUTF8BrokenBytes to
  305.   detect / retrieve the broken bytes to pump into next chunk from stream.
  306. }
  307. function TUnicodeStringHelper.fromUTF8Bytes(bytes : TBytes) : UnicodeString;
  308. var
  309.   len, pos : integer;
  310.   val : UInt32;
  311.   b : byte;
  312. begin
  313.   len := system.length(bytes);
  314.   result := '';
  315.   pos := 0;
  316.   while pos < len do
  317.   begin
  318.     b := bytes[pos];
  319.     if      (b and %11111000) = %11110000 then
  320.     begin
  321.       // 4 bytes
  322.       raise exception.create('Characters $10000+ unsupported');
  323.     end
  324.     else if (b and %11110000) = %11100000 then
  325.     begin
  326.       // 3 bytes
  327.       if pos + 3 <= len then
  328.       begin
  329.         val :=  (bytes[pos + 2] and $3F)
  330.             or ((bytes[pos + 1] and $3F) << 6)
  331.             or ((b              and $0F) << 12);
  332.         result += WideChar(val);
  333.       end;
  334.       //else broken
  335.       pos += 3;
  336.     end
  337.     else if (b and %11100000) = %11000000 then
  338.     begin
  339.       // 2 bytes
  340.       if pos + 2 <= len then
  341.       begin
  342.         val :=  (bytes[pos + 1] and $3F)
  343.             or ((b              and $1F) << 6);
  344.         result += WideChar(val);
  345.       end;
  346.       //else broken
  347.       pos += 2;
  348.     end
  349.     else
  350.     begin
  351.       // 1 byte; a stray continuation or invalid lead byte is dropped, but pos still advances
  352.       if b < $80 then
  353.         result += WideChar(b);
  354.       pos += 1;
  355.     end;
  356.   end;
  357. end;
  358.  
  359. {
  360.   fromUTF16Bytes : builds a unicodestring from little endian UTF16 bytes, two
  361.   bytes per WideChar. an odd trailing byte is ignored; use hasUTF16BrokenBytes
  362.   / getUTF16BrokenBytes to detect / retrieve it for the next chunk of a stream.
  363. }
  364. function TUnicodeStringHelper.fromUTF16Bytes(bytes : TBytes) : UnicodeString;
  365. var
  366.   units, i : integer;
  367. begin
  368.   // number of complete 16 bit units; a dangling last byte is not counted
  369.   units := system.length(bytes) div 2;
  370.   result := '';
  371.   setlength(result, units);
  372.   for i := 0 to units - 1 do
  373.   begin
  374.     // low byte first, high byte second
  375.     result[i + 1] := widechar(bytes[2 * i] or (bytes[2 * i + 1] << 8));
  376.   end;
  377. end;
  378.  
  379. {
  380.   fromCPBytes : widens each byte to the WideChar that has the same code (0..255).
  381. }
  382. function TUnicodeStringHelper.fromCPBytes(bytes : TBytes) : UnicodeString;
  383. var
  384.   i : integer;
  385. begin
  386.   result := '';
  387.   setlength(result, system.length(bytes));
  388.   for i := low(bytes) to high(bytes) do
  389.     result[i + 1] := WideChar(bytes[i]);
  390. end;
  391.  
  392. {
  393.   getUTF8BrokenBytes : returns left overs of broken codepoints in byte array ([] when none).
  394. }
  395. function TUnicodeStringHelper.getUTF8BrokenBytes(bytes : TBytes) : TBytes;
  396. var
  397.   len, pos : integer;
  398.   b : byte;
  399. begin
  400.   len := system.length(bytes);
  401.   pos := 0;
  402.   while pos < len do
  403.   begin
  404.     b := bytes[pos];
  405.     if      (b and %11111000) = %11110000 then
  406.     begin
  407.       // 4 bytes
  408.       raise exception.create('Characters $10000+ unsupported');
  409.     end
  410.     else if (b and %11110000) = %11100000 then
  411.     begin
  412.       // 3 bytes: if fewer than 3 remain, hand back everything from the lead byte on
  413.       if pos + 3 > len then
  414.       begin
  415.         setlength(result, len - pos);
  416.         move(bytes[pos], result[0], len-pos);
  417.         exit;
  418.       end;
  419.       pos += 3;
  420.     end
  421.     else if (b and %11100000) = %11000000 then
  422.     begin
  423.       // 2 bytes: if fewer than 2 remain, hand back the lead byte
  424.       if pos + 2 > len then
  425.       begin
  426.         setlength(result, len - pos);
  427.         move(bytes[pos], result[0], len-pos);
  428.         exit;
  429.       end;
  430.       pos += 2;
  431.     end
  432.     else // 1 byte, or a stray continuation / invalid lead byte
  433.     begin
  434.       pos += 1;   // always advance so the scan terminates on malformed input
  435.     end;
  436.   end;
  437.   setlength(result, 0);
  438. end;
  439.  
  440. {
  441.   getUTF16BrokenBytes : the odd trailing byte of the array, or [] when the byte count is even.
  442. }
  443. function TUnicodeStringHelper.getUTF16BrokenBytes(bytes : TBytes) : TBytes;
  444. var
  445.   count : integer;
  446. begin
  447.   count := system.length(bytes);
  448.   setlength(Result, count and $1);        // 1 when the byte count is odd, else 0
  449.   if odd(count) then
  450.     // keep the dangling last byte so the caller can retrieve it
  451.     Result[0] := bytes[count - 1];
  452. end;
  453.  
  454. {
  455.   getCPBrokenBytes : the result is an empty byte array for any input.
  456. }
  457. function TUnicodeStringHelper.getCPBrokenBytes(bytes : TBytes) : TBytes; inline;
  458. begin
  459.   Result := nil;   // nil is the empty dynamic array
  460. end;
  461.  
  462. {
  463.   hasUTF8BrokenBytes : returns true if there is a broken (incomplete)
  464.   codepoint at the end of the byte array.
  465. }
  466. function TUnicodeStringHelper.hasUTF8BrokenBytes(bytes : TBytes) : boolean;
  467. var
  468.   len, pos : integer;
  469.   b : byte;
  470. begin
  471.   len := system.length(bytes);
  472.   pos := 0;
  473.   while pos < len do
  474.   begin
  475.     b := bytes[pos];
  476.     if      (b and %11111000) = %11110000 then
  477.     begin
  478.       // 4 bytes
  479.       raise exception.create('Characters $10000+ unsupported');
  480.     end
  481.     else if (b and %11110000) = %11100000 then
  482.     begin
  483.       // 3 bytes: broken only when fewer than 3 bytes remain (pos + 3 = len is complete)
  484.       if pos + 3 > len then
  485.         exit(true);
  486.       pos += 3;
  487.     end
  488.     else if (b and %11100000) = %11000000 then
  489.     begin
  490.       // 2 bytes: broken only when fewer than 2 bytes remain (pos + 2 = len is complete)
  491.       if pos + 2 > len then
  492.         exit(true);
  493.       pos += 2;
  494.     end
  495.     else // 1 byte, or a stray continuation / invalid lead byte
  496.     begin
  497.       pos += 1;   // always advance so the scan terminates on malformed input
  498.     end;
  499.   end;
  500.   result := false;
  501. end;
  502.  
  503. {
  504.   hasUTF16BrokenBytes : true when the byte count is odd, i.e. the last byte
  505.   is only half of a 16 bit unit.
  506. }
  507. function TUnicodeStringHelper.hasUTF16BrokenBytes(bytes : TBytes) : boolean; inline;
  508. begin
  509.   exit(odd(system.length(bytes)));
  510. end;
  511.  
  512. {
  513.   hasCPBrokenBytes : the answer is false for any input; bytes is not inspected.
  514. }
  515. function TUnicodeStringHelper.hasCPBrokenBytes(bytes : TBytes) : boolean; inline;
  516. begin
  517.   exit(false);
  518. end;
  519.  
  520. {
  521.   mapCP : converts a codepaged unicodestring to true unicode in place, using
  522.   an array [0..255] of WideChars indexed by character code. a character
  523.   outside the 0-255 range is treated as code 0, i.e. it becomes map[0].
  524. }
  525. procedure TUnicodeStringHelper.mapCP(map : TWideChars);
  526. // map    : lookup table, must hold exactly 256 entries.
  527. // raises : Exception when the table has any other length.
  528. var
  529.   len, i : integer;
  530.   cpchr : integer;
  531. begin
  532.   if system.length(map) <> 256 then
  533.     raise exception.create('Invalid mapping table length. Needs 256 characters.');
  534.  
  535.   // each WideChar is replaced by exactly one WideChar, so the string can be
  536.   // rewritten where it stands; no temporary buffer has to be allocated,
  537.   // filled and released.
  538.   len := system.length(self);
  539.   for i := 1 to len do
  540.   begin
  541.     cpchr := self[i].toCharCode;
  542.     if cpchr > 255 then
  543.       cpchr := 0; // out of range: fall back to the entry for code 0
  544.     self[i] := map[cpchr];
  545.   end;
  546. end;
  547.  
  548. end.
  549.  
« Last Edit: August 29, 2017, 09:56:58 am by codewar65 »

 

TinyPortal © 2005-2018