Forum > Packages and Libraries

WideChar / UnicodeString type helper

(1/1)

codewar65:
I had a specific need for this and thought I'd share. I've been working with various encoding types in the last few projects and needed to be able to convert bytes to UTF-8 / UTF-16 and to handle 8-bit code-paged encoding/decoding. With all the existing units that handle this type of stuff, I was getting confused about which string type to use and which package / unit had the functions I needed.

I can now use UnicodeString / WideChar everywhere internally and convert to/from everything else on an as-needed basis using these helpers.

Please let me know if you notice any glaring mistakes.
Thanks / enjoy.

*modified for minor bug catches.


{
  UnicodeHelper : type helpers for WideChar and UnicodeString that convert
  to / from UTF-8, UTF-16 (little endian) and 8-bit code-paged byte arrays.

  Limitations (unchanged from the original design):
    - Only the Basic Multilingual Plane is supported. Decoding a 4-byte UTF-8
      lead byte raises an exception.
    - Surrogate code units are not combined; each one is encoded on its own
      as a 3-byte sequence by toUTF8Bytes.
    - Continuation bytes are not validated when decoding UTF-8.
}
unit UnicodeHelper;

{$mode objfpc}{$H+}{$modeswitch typehelpers}

interface

uses
  SysUtils;

type

  TUnicodeStringArray = array of UnicodeString;
  TWords = array of Word;
  TWideChars = array of WideChar;

  TWideCharHelper = type helper for WideChar
    public
      function getUTF8Length : integer;   // from 1 to 3 (4=not yet).
      function getUTF16Length : integer;  // always 2
      function getCPLength : integer;     // always 1
      function toCharCode : integer;
      function fromCharCode(chr : integer) : WideChar;
  end;

  TUnicodeStringHelper = type helper for UnicodeString
    public
      function length : integer; overload;
      function substring(index : Integer): unicodestring; overload;
      function substring(index : Integer; len : Integer): unicodestring; overload;
      function charCodeAt(index : integer) : integer;
      function charAt(index : integer) : WideChar;
      function split(const Separators: array of WideChar): TUnicodeStringArray; overload;
      // other String Helper type functions can be added as required.

      function toWideCharArray : TWideChars;
      function toWordArray : TWords;

      function toUTF8Bytes : TBytes;
      function toUTF16Bytes : TBytes;
      function toCPBytes : TBytes;

      function getUTF8BytesLength : integer;                  // varies
      function getUTF16BytesLength : integer;                 // length * 2
      function getCPBytesLength : integer;                    // length

      function fromUTF8Bytes(bytes : TBytes) : UnicodeString;
      function fromUTF16Bytes(bytes : TBytes) : UnicodeString;
      function fromCPBytes(bytes : TBytes) : UnicodeString;

      function hasUTF8BrokenBytes(bytes : TBytes) : boolean;
      function hasUTF16BrokenBytes(bytes : TBytes) : boolean;
      function hasCPBrokenBytes(bytes : TBytes) : boolean;    // always false

      function getUTF8BrokenBytes(bytes : TBytes) : TBytes;
      function getUTF16BrokenBytes(bytes : TBytes) : TBytes;
      function getCPBrokenBytes(bytes : TBytes) : TBytes;     // always returns []

      procedure mapCP(map : TWideChars);
  end;

implementation

const
  SUnsupportedPlane = 'Characters $10000+ unsupported';

{ ---------------------------------------------------------------------------
  Unit-private UTF-8 scanning helpers shared by fromUTF8Bytes,
  hasUTF8BrokenBytes and getUTF8BrokenBytes so the three always agree.
  --------------------------------------------------------------------------- }

{
  UTF8SequenceLength : number of bytes in the sequence introduced by lead.
  Returns 1..3 for supported lead bytes and 0 for a byte that cannot start a
  sequence (a stray continuation byte $80..$BF or an invalid lead $F8..$FF).
  Raises Exception for a 4-byte lead ($F0..$F7).
}
function UTF8SequenceLength(lead : byte) : integer;
begin
  if (lead and %10000000) = %00000000 then
    result := 1
  else if (lead and %11100000) = %11000000 then
    result := 2
  else if (lead and %11110000) = %11100000 then
    result := 3
  else if (lead and %11111000) = %11110000 then
    raise Exception.Create(SUnsupportedPlane)
  else
    result := 0;
end;

{
  FindUTF8BrokenTail : index of the first byte of an incomplete sequence at
  the end of bytes, or -1 when the array ends on a sequence boundary.
  Bytes that cannot start a sequence are skipped one at a time; without this
  the scan would never advance and loop forever.
}
function FindUTF8BrokenTail(const bytes : TBytes) : integer;
var
  total, pos, seqLen : integer;
begin
  result := -1;
  total := System.Length(bytes);
  pos := 0;
  while pos < total do
  begin
    seqLen := UTF8SequenceLength(bytes[pos]);
    if seqLen = 0 then
      seqLen := 1                       // stray byte: step over it
    else if pos + seqLen > total then
      exit(pos);                        // sequence runs past the end
    Inc(pos, seqLen);
  end;
end;

{ TWideCharHelper }

{
  getUTF8Length : number of bytes needed to encode this code unit as UTF-8.
  A WideChar never exceeds $FFFF, so the result is 1, 2 or 3 in practice; the
  final branch is kept for symmetry with the decoder.
}
function TWideCharHelper.getUTF8Length : integer;
var
  code : integer;
begin
  code := Ord(self);
  if code < $80 then
    result := 1
  else if code < $800 then
    result := 2
  else if code < $10000 then
    result := 3
  else
    result := 4;
end;

{ getUTF16Length : bytes needed to encode this code unit as UTF-16. Always 2. }
function TWideCharHelper.getUTF16Length : integer; inline;
begin
  result := 2;
end;

{ getCPLength : bytes needed to encode this character in a codepage. Always 1. }
function TWideCharHelper.getCPLength : integer; inline;
begin
  result := 1;
end;

{ toCharCode : ordinal value of this character. }
function TWideCharHelper.toCharCode : integer; inline;
begin
  result := Ord(self);
end;

{ fromCharCode : WideChar with ordinal value chr. Does not modify self. }
function TWideCharHelper.fromCharCode(chr : integer) : WideChar; inline;
begin
  result := WideChar(chr);
end;

{ TUnicodeStringHelper }

{
  length : length of the UnicodeString in WideChars.
}
function TUnicodeStringHelper.length : integer; inline;
begin
  result := system.length(self);
end;

{
  substring : returns everything from the zero-based index to the end of the
  string, or '' when index is out of range.
}
function TUnicodeStringHelper.substring(index : Integer): unicodestring;
var
  strlen : integer;
begin
  strlen := system.length(self);
  if (index < 0) or (index >= strlen) then
    result := ''
  else
    result := Copy(self, index + 1, strlen - index);
end;

{
  substring : returns up to len characters starting at the zero-based index.
  Returns '' when index is out of range or len <= 0; len is clamped to the
  end of the string.
}
function TUnicodeStringHelper.substring(index : Integer; len : Integer): unicodestring;
var
  strlen : integer;
begin
  strlen := system.length(self);
  if (index < 0) or (index >= strlen) or (len <= 0) then
    result := ''
  else
  begin
    // Compare against the remainder rather than index + len to avoid
    // integer overflow for very large len values.
    if len > strlen - index then
      len := strlen - index;
    result := Copy(self, index + 1, len);
  end;
end;

{
  charCodeAt : ordinal value of the character at the zero-based index, or 0
  when index is out of range.
}
function TUnicodeStringHelper.charCodeAt(index : integer) : integer; inline;
begin
  if (index < 0) or (index >= system.length(self)) then
    result := 0
  else
    result := Ord(self[index + 1]);
end;

{
  charAt : character at the zero-based index, or #0 when index is out of
  range.
}
function TUnicodeStringHelper.charAt(index : integer) : WideChar; inline;
begin
  if (index < 0) or (index >= system.length(self)) then
    result := WideChar(0)
  else
    result := self[index + 1];
end;

{
  split : splits the string at every occurrence of any of the Separators.
  Empty fields are preserved, and the result always has at least one element
  (the empty string yields a single empty element).
}
function TUnicodeStringHelper.split(const Separators: array of WideChar): TUnicodeStringArray;
var
  i, j, lastpos, count : integer;
  ch : WideChar;
begin
  setlength(result, 0);
  count := 0;
  lastpos := 0;
  for i := 0 to system.length(self) - 1 do
  begin
    ch := self[i + 1];
    for j := 0 to High(Separators) do
    begin
      if ch = Separators[j] then
      begin
        Inc(count);
        setlength(result, count);
        result[count - 1] := self.substring(lastpos, i - lastpos);
        lastpos := i + 1;
        break;
      end;
    end;
  end;
  // Trailing field (everything after the last separator).
  setlength(result, count + 1);
  result[count] := self.substring(lastpos);
end;

{
  toWideCharArray : returns a copy of the string as an array of WideChar.
}
function TUnicodeStringHelper.toWideCharArray : TWideChars;
var
  count : integer;
begin
  count := system.length(self);
  setlength(result, count);
  // Copy every character (the original copied only the first one) and do
  // not touch self[1] / result[0] when the string is empty.
  if count > 0 then
    move(self[1], result[0], count * sizeof(WideChar));
end;

{
  toWordArray : returns a copy of the string's code units as an array of Word.
}
function TUnicodeStringHelper.toWordArray : TWords;
var
  count : integer;
begin
  count := system.length(self);
  setlength(result, count);
  if count > 0 then
    move(self[1], result[0], count * sizeof(WideChar));
end;

{
  toUTF8Bytes : returns array of bytes encoded in UTF8.
  NOTE: surrogate code units are encoded individually as 3-byte sequences
  (no 4-byte output is ever produced).
}
function TUnicodeStringHelper.toUTF8Bytes : TBytes;
var
  i, code, pos : integer;
begin
  setlength(Result, self.getUTF8BytesLength);
  pos := 0;
  for i := 1 to system.length(self) do
  begin
    code := Ord(self[i]);
    if code < $80 then
    begin
      Result[pos] := code;
      Inc(pos);
    end
    else if code < $800 then
    begin
      Result[pos]     := %11000000 or ((code shr 6) and %00011111);
      Result[pos + 1] := %10000000 or (code and %00111111);
      Inc(pos, 2);
    end
    else
    begin
      Result[pos]     := %11100000 or ((code shr 12) and %00001111);
      Result[pos + 1] := %10000000 or ((code shr 6) and %00111111);
      Result[pos + 2] := %10000000 or (code and %00111111);
      Inc(pos, 3);
    end;
  end;
end;

{
  toUTF16Bytes : returns array of bytes encoded in UTF16, little endian, so
  the output matches what fromUTF16Bytes expects on every platform.
}
function TUnicodeStringHelper.toUTF16Bytes : TBytes;
var
  i, code : integer;
begin
  setlength(Result, self.getUTF16BytesLength);
  for i := 1 to system.length(self) do
  begin
    code := Ord(self[i]);
    Result[(i - 1) * 2]     := code and $FF;
    Result[(i - 1) * 2 + 1] := (code shr 8) and $FF;
  end;
end;

{
  toCPBytes : returns array of bytes, one per character. if character is
  beyond the 255 range, it is converted to NULL.
}
function TUnicodeStringHelper.toCPBytes : TBytes;
var
  count, i, code : integer;
begin
  // One byte per character. The original sized this with the UTF-8 length
  // and therefore read past the end of the string.
  count := self.getCPBytesLength;
  setlength(Result, count);
  for i := 1 to count do
  begin
    code := Ord(self[i]);
    if code > 255 then
      code := 0;
    Result[i - 1] := code;
  end;
end;

{
  getUTF8BytesLength : returns number of bytes required to encode as UTF8.
}
function TUnicodeStringHelper.getUTF8BytesLength : integer;
var
  i : integer;
begin
  result := 0;
  for i := 1 to system.length(self) do
    Inc(result, self[i].getUTF8Length);
end;

{
  getUTF16BytesLength : returns number of bytes required to encode as UTF16.
}
function TUnicodeStringHelper.getUTF16BytesLength : integer; inline;
begin
  result := system.length(self) * 2;
end;

{
  getCPBytesLength : returns number of bytes required to encode as codepage.
  Does not consider any characters beyond the 255 charcode value.
}
function TUnicodeStringHelper.getCPBytesLength : integer; inline;
begin
  result := system.length(self);
end;

{
  fromUTF8Bytes : returns unicodestring of UTF8 in bytes. ignores broken bytes
  of partial codepoints on end, and skips bytes that cannot start a sequence.
  use hasUTF8BrokenBytes / getUTF8BrokenBytes to detect / retrieve the broken
  bytes to pump into next chunk from stream.
  Raises Exception on a 4-byte lead byte. Does not modify self.
}
function TUnicodeStringHelper.fromUTF8Bytes(bytes : TBytes) : UnicodeString;
var
  total, pos, seqLen, count : integer;
  code : UInt32;
  lead : byte;
begin
  total := system.length(bytes);
  // Every decoded character consumes at least one byte, so total is an
  // upper bound; allocate once and trim at the end.
  setlength(result, total);
  count := 0;
  pos := 0;
  while pos < total do
  begin
    lead := bytes[pos];
    seqLen := UTF8SequenceLength(lead);
    if seqLen = 0 then
    begin
      Inc(pos);                         // stray byte: skip, keep scanning
      continue;
    end;
    if pos + seqLen <= total then
    begin
      case seqLen of
        1: code := lead;
        2: code := ((lead and $1F) shl 6)
                or  (bytes[pos + 1] and $3F);
      else
        code := ((lead and $0F) shl 12)
             or ((bytes[pos + 1] and $3F) shl 6)
             or  (bytes[pos + 2] and $3F);
      end;
      Inc(count);
      result[count] := WideChar(code);
    end;
    // else: truncated sequence at the end of the chunk - dropped here.
    Inc(pos, seqLen);
  end;
  setlength(result, count);
end;

{
  fromUTF16Bytes : returns unicodestring of UTF16 (little endian) in bytes.
  ignores a trailing odd byte. use hasUTF16BrokenBytes / getUTF16BrokenBytes
  to detect / retrieve the broken byte to pump into next chunk from stream.
  Does not modify self.
}
function TUnicodeStringHelper.fromUTF16Bytes(bytes : TBytes) : UnicodeString;
var
  count, i : integer;
begin
  count := system.length(bytes) div 2;
  setlength(result, count);
  for i := 0 to count - 1 do
    result[i + 1] := WideChar(bytes[i * 2] or (bytes[i * 2 + 1] shl 8));
end;

{
  fromCPBytes : returns unicodestring whose characters have the ordinal
  values of the given bytes. Does not modify self.
}
function TUnicodeStringHelper.fromCPBytes(bytes : TBytes) : UnicodeString;
var
  count, i : integer;
begin
  count := system.length(bytes);
  setlength(result, count);
  for i := 0 to count - 1 do
    result[i + 1] := WideChar(bytes[i]);
end;

{
  getUTF8BrokenBytes : returns left overs of broken codepoints in byte array
  (the bytes of an incomplete trailing sequence), or [] when there are none.
  Raises Exception on a 4-byte lead byte.
}
function TUnicodeStringHelper.getUTF8BrokenBytes(bytes : TBytes) : TBytes;
var
  start, count : integer;
begin
  start := FindUTF8BrokenTail(bytes);
  if start < 0 then
    setlength(result, 0)
  else
  begin
    count := system.length(bytes) - start;
    setlength(result, count);
    move(bytes[start], result[0], count);
  end;
end;

{
  getUTF16BrokenBytes : returns left overs of broken codepoints in byte array
  (the final byte when the array length is odd), otherwise [].
}
function TUnicodeStringHelper.getUTF16BrokenBytes(bytes : TBytes) : TBytes;
begin
  if self.hasUTF16BrokenBytes(bytes) then
  begin
    setlength(Result, 1);
    Result[0] := bytes[system.length(bytes) - 1];
  end
  else
    setlength(Result, 0);
end;

{
  getCPBrokenBytes : always returns empty byte array.
}
function TUnicodeStringHelper.getCPBrokenBytes(bytes : TBytes) : TBytes; inline;
begin
  setlength(Result, 0);
end;

{
  hasUTF8BrokenBytes : returns true if there is a broken codepoint at the end
  of the byte array. A complete trailing sequence is NOT reported as broken,
  so the result always agrees with getUTF8BrokenBytes.
  Raises Exception on a 4-byte lead byte.
}
function TUnicodeStringHelper.hasUTF8BrokenBytes(bytes : TBytes) : boolean;
begin
  result := FindUTF8BrokenTail(bytes) >= 0;
end;

{
  hasUTF16BrokenBytes : returns true if there is a broken codepoint at the end
  of the byte array (i.e. the length is odd).
}
function TUnicodeStringHelper.hasUTF16BrokenBytes(bytes : TBytes) : boolean; inline;
begin
  result := ((system.length(bytes) and $1) <> 0);
end;

{
  hasCPBrokenBytes : always returns false.
}
function TUnicodeStringHelper.hasCPBrokenBytes(bytes : TBytes) : boolean; inline;
begin
  result := false;
end;

{
  mapCP : will convert a codepaged unicodestring to true unicode using an
  array [0..255] of WideChars. if a character is outside the 0-255 range, it
  will be mapped via entry 0 (null). Modifies self in place.
  Raises Exception when map does not hold exactly 256 entries.
}
procedure TUnicodeStringHelper.mapCP(map : TWideChars);
var
  count, i, code : integer;
  mapped : UnicodeString;
begin
  if system.length(map) <> 256 then
    raise exception.create('Invalid mapping table length. Needs 256 characters.');

  // Build the result in a managed string: no manual buffer to leak, and no
  // raw move into a pointer variable (the original overwrote the pointer
  // itself instead of the memory it pointed to).
  count := system.length(self);
  setlength(mapped, count);
  for i := 1 to count do
  begin
    code := Ord(self[i]);
    if code > 255 then
      code := 0; // set to null if out of range.
    mapped[i] := map[code];
  end;
  self := mapped;
end;

end.

Navigation

[0] Message Index

Go to full version