WideChar / UnicodeString type helper

I had a specific need for this and thought I'd share. I've been working with various encoding types in the last few projects and needed to be able to convert bytes to UTF8 / UTF16 and handling 8bit codepaged encoding/decoding. With all the existing units to handle this type of stuff i was getting confused on what string type to use and which package / unit had the functions I needed.

I can now use all UnicodeString / WideChar internally and convert to/from everything else on a as-need basis using these helpers.

Please let me know if you notice any glaring mistakes.
Thanks / enjoy.

*modified for minor bug catches.
Code: Pascal [Select][+]
unit UnicodeHelper;
 
{$mode objfpc}{$H+}
{$modeswitch typehelpers}
 
interface
 
uses
  SysUtils;
 
type
 
  TUnicodeStringArray = array of UnicodeString;
 
  TWords = array of Word;
 
  TWideChars = array of WideChar;
 
  TWideCharHelper = type helper for WideChar
    public
      function getUTF8Length : integer;   // from 1 to 3 (4=not yet).
      function getUTF16Length : integer;  // always 2
      function getCPLength : integer;     // always 1
      function toCharCode : integer;
      function fromCharCode(chr : integer) : WideChar;
  end;
 
  TUnicodeStringHelper = type helper for UnicodeString
    public
      function length : integer; overload;
      function substring(index : Integer): unicodestring; overload;
      function substring(index : Integer; len : Integer): unicodestring; overload;
      function charCodeAt(index : integer) : integer;
      function charAt(index : integer) : WideChar;
      function split(const Separators: array of WideChar): TUnicodeStringArray; overload;
      // other String Helper type functions can be added as required.
 
      function toWideCharArray : TWideChars;
      function toWordArray : TWords;
 
      function toUTF8Bytes : TBytes;
      function toUTF16Bytes : TBytes;
      function toCPBytes : TBytes;
 
      function getUTF8BytesLength : integer;                  // varies
      function getUTF16BytesLength : integer;                 // length * 2
      function getCPBytesLength : integer;                    // length
 
      function fromUTF8Bytes(bytes : TBytes) : UnicodeString;
      function fromUTF16Bytes(bytes : TBytes) : UnicodeString;
      function fromCPBytes(bytes : TBytes) : UnicodeString;
 
      function hasUTF8BrokenBytes(bytes : TBytes) : boolean;
      function hasUTF16BrokenBytes(bytes : TBytes) : boolean;
      function hasCPBrokenBytes(bytes : TBytes) : boolean;    // always false
 
      function getUTF8BrokenBytes(bytes : TBytes) : TBytes;
      function getUTF16BrokenBytes(bytes : TBytes) : TBytes;
      function getCPBrokenBytes(bytes : TBytes) : TBytes;     // always returns []
 
      procedure mapCP(map : TWideChars);
  end;
 
 
implementation
 
{ TWideCHarHelper }
 
function TWideCharHelper.getUTF8Length : integer;
begin
  if integer(self) < $80 then           result := 1
  else if integer(self)  < $800 then    result := 2
  else if integer(self)  < $10000 then  result := 3
  else                                  result := 4;
end;
 
function TWideCharHelper.getUTF16Length : integer; inline;
begin
  result := 2;
end;
 
function TWideCharHelper.getCPLength : integer; inline;
begin
  result := 1;
end;
 
function TWideCharHelper.toCharCode : integer; inline;
begin
  result := integer(self);
end;
 
function TWideCharHelper.fromCharCode(chr : integer) : WideChar; inline;
begin
  result := WideChar(chr);
end;
 
{ TUnicodeStringHelper }
 
{
  length : length of the UnicodeString in WideChars.
}
function TUnicodeStringHelper.length : integer; inline;
begin
  result := system.length(self);
end;
 
function TUnicodeStringHelper.substring(index : Integer): unicodestring;
var
  strlen, len : integer;
begin
  strlen := self.length;
  if (index < 0) or (index >= strlen) then
    result := ''
  else
  begin
    len := strlen - index;
    setlength(result, len);
    move(self[1 + index], result[1], len * sizeof(WideChar));
  end;
end;
 
function TUnicodeStringHelper.substring(index : Integer; len : Integer): unicodestring;
var
  strlen : integer;
begin
  strlen := self.length;
  if (index < 0) or (index >= strlen) or (len <= 0) then
    result := ''
  else
  begin
    if index + len > strlen then
      len := strlen - index;
    setlength(result, len);
    move(self[1 + index], result[1], len * sizeof(WideChar));
  end;
end;
 
function TUnicodeStringHelper.charCodeAt(index : integer) : integer; inline;
begin
  if (index < 0) or (index >= self.length) then
    result := 0
  else
    result := self[index + 1].toCharCode;
end;
 
function TUnicodeStringHelper.charAt(index : integer) : WideChar; inline;
begin
  if (index < 0) or (index >= self.length) then
    result := WideChar(0)
  else
    result := self[index + 1];
end;
 
Function TUnicodeStringHelper.split(const Separators: array of WideChar): TUnicodeStringArray;
var
  i, j, lastpos : integer;
  ch : widechar;
 
  x : UnicodeString;
begin
  x := self;
  setlength(result, 0);
  lastpos := 0;
  for i := 0 to self.length - 1 do
  begin
    ch := self.charAt(i);
    for j := 0 to system.length(Separators) - 1 do
    begin
      if ch = Separators[j] then
      begin
        setlength(result, system.length(result) + 1);
        result[system.length(result) - 1] := self.substring(lastpos, i - lastpos);
        lastpos := i + 1;
        break;
      end;
    end;
  end;
  setlength(result, system.length(result) + 1);
  result[system.length(result) - 1] := self.substring(lastpos);
end;
 
function TUnicodeStringHelper.toWideCharArray : TWideChars;
begin
  setlength(result, self.length);
  move(self[1], result[0], sizeof(WideChar));
end;
 
function TUnicodeStringHelper.toWordArray : TWords;
var
  len : longint;
begin
  len := self.length;
  setlength(result, len);
  move(self[1], result[0], sizeof(WideChar) * len);
end;
 
{
  toUTF8Bytes : returns array of bytes encoded in UTF8.
}
function TUnicodeStringHelper.toUTF8Bytes : TBytes;
var
  cv, i, len, cl :  integer;
  p :               pbyte;
  cw :              WideChar;
begin
  len := self.getUTF8BytesLength;
  setlength(Result, len);
  p := @Result[0];
  for i := 1 to self.length do
  begin
    cw := self[i];
    cv := cw.toCharCode;
    cl := cw.getUTF8Length;
    case cl of
      1:
        begin
          p^ := cv;
        end;
 
      2:
        begin
          p^ := %11000000 or ((cv >>  6) and %00011111);
          p += 1;
          p^ := %10000000 or (cv         and %00111111);
        end;
 
      3:
        begin
          p^ := %11100000 or ((cv >> 12) and %00001111);
          p += 1;
          p^ := %10000000 or ((cv >>  6) and %00111111);
          p += 1;
          p^ := %10000000 or (cv         and %00111111);
        end;
 
      4:  raise exception.create('Characters $10000+ unsupported');
    end;
    p += 1;
  end;
end;
 
{
  toUTF16Bytes : returns array of bytes encoded in UTF16.
}
function TUnicodeStringHelper.toUTF16Bytes : TBytes;
var
  len : integer;
begin
  len := self.getUTF16BytesLength;
  setlength(Result, len);
  move(self[1], Result[0], len);
end;
 
{
  toCPBytes : returns array of bytes. if character is beyond the 255 range, it
  is converted to NULL.
}
function TUnicodeStringHelper.toCPBytes : TBytes;
var
  len, i, cv : integer;
begin
  len := self.getUTF8BytesLength;
  setlength(Result, len);
  for i := 1 to len do
  begin
    cv := self[i].toCharCode;
    if cv > 255 then
      cv := 0;
    Result[i - 1] := cv;
  end;
end;
 
{
  getUTF8BytesLength : returns number of bytes required to encode as UTF8.
}
function TUnicodeStringHelper.getUTF8BytesLength : integer;
var
  i : integer;
begin
  result := 0;
  for i := 1 to system.length(self) do
    result += self[i].getUTF8Length;
end;
 
{
  getUTF16BytesLength : returns number of bytes required to encode as UTF16.
}
function TUnicodeStringHelper.getUTF16BytesLength : integer; inline;
begin
  result := self.length << 1;
end;
 
{
  getCPBytesLength : returns number of bytes required to encode as codepage.
  Does not consider any characters beyond the 255 charcode value.
}
function TUnicodeStringHelper.getCPBytesLength : integer; inline;
begin
  result := self.length;
end;
 
{
  fromUTF8Bytes : returns unicodestring of UTF8 in bytes. ignores broken bytes
  of partial codepoints on end. use hasUTF8BrokenBytes / getUTF8BrokenBytes to
  detect / retrieve the broken bytes to pump into next chunk from stream.
}
function TUnicodeStringHelper.fromUTF8Bytes(bytes : TBytes) : UnicodeString;
var
  len, pos : integer;
  val : UInt32;
  b : byte;
begin
  len := system.length(bytes);
  result := '';
  pos := 0;
  while pos < len do
  begin
    b := bytes[pos];
    if      (b and %11111000) = %11110000 then
    begin
      // 4 bytes
      raise exception.create('Characters $10000+ unsupported');
    end
    else if (b and %11110000) = %11100000 then
    begin
      // 3 bytes
      if pos + 3 <= len then
      begin
        val :=  (bytes[pos + 2] and $3F)
            or ((bytes[pos + 1] and $3F) << 6)
            or ((b              and $0F) << 12);
        result += WideChar(val);
      end;
      //else broken
      pos += 3;
    end
    else if (b and %11100000) = %11000000 then
    begin
      // 2 bytes
      if pos + 2 <= len then
      begin
        val :=  (bytes[pos + 1] and $3F)
            or ((b              and $1F) << 6);
        result += WideChar(val);
      end;
      //else broken
      pos += 2;
 
    end
    else if (b and %10000000) = %00000000 then
    begin
      // 1 byte
      result += WideChar(b);
      pos += 1;
    end;
  end;
end;
 
{
  fromUTF16Bytes : returns unicodestring of UTF16 in bytes. ignores broken bytes
  of partial codepoints on end. use hasUTF16BrokenBytes / getUTF16BrokenBytes
  to detect / retrieve the broken bytes to pump into next chunk from stream.
}
function TUnicodeStringHelper.fromUTF16Bytes(bytes : TBytes) : UnicodeString;
var
  len, pos : integer;
begin
  len := system.length(bytes);
  result := '';
  pos := 0;
  while pos < len do
  begin
    if pos + 1 < len then
      result += widechar(bytes[pos] + (bytes[pos + 1] << 8)); // little endian
    pos += 2;
  end;
end;
 
{
  fromCPBytes : returns unicodestring of ascii in bytes.
}
function TUnicodeStringHelper.fromCPBytes(bytes : TBytes) : UnicodeString;
var
  len, i : integer;
begin
  len := system.length(bytes);
  result := '';
  for i := 0 to len - 1 do
    result += WideChar(bytes[i]);
end;
 
{
  getUTF8BrokenBytes : returns left overs of broken codepoints in byte array.
}
function TUnicodeStringHelper.getUTF8BrokenBytes(bytes : TBytes) : TBytes;
var
  len, pos : integer;
  b : byte;
begin
  len := system.length(bytes);
  pos := 0;
  while pos < len do
  begin
    b := bytes[pos];
    if      (b and %11111000) = %11110000 then
    begin
      // 4 bytes
      raise exception.create('Characters $10000+ unsupported');
    end
    else if (b and %11110000) = %11100000 then
    begin
      // 3 bytes
      if pos + 3 > len then
      begin
        setlength(result, len - pos);
        move(bytes[pos], result[0], len-pos);
        exit;
      end;
      pos += 3;
    end
    else if (b and %11100000) = %11000000 then
    begin
      // 2 bytes
      if pos + 2 > len then
      begin
        setlength(result, len - pos);
        move(bytes[pos], result[0], len-pos);
        exit;
      end;
      pos += 2;
    end
    else if (b and %10000000) = %00000000 then
    begin
      pos += 1;
    end;
  end;
  setlength(result, 0);
end;
 
{
  getUTF16BrokenBytes : returns left overs of broken codepoints in byte array.
}
function TUnicodeStringHelper.getUTF16BrokenBytes(bytes : TBytes) : TBytes;
begin
  if self.HasUTF16BrokenBytes(bytes) then
  begin
    setlength(Result, 1);
    Result[0] := bytes[system.length(bytes) - 1];
  end
  else
    setlength(Result, 0);
end;
 
{
  getCPBrokenBytes : always returns empty byte array.
}
function TUnicodeStringHelper.getCPBrokenBytes(bytes : TBytes) : TBytes; inline;
begin
  setlength(Result, 0);
end;
 
{
  hasUTF8BrokenBytes : returns true if there is a broken codepoint at the end
  of the byte array.
}
function TUnicodeStringHelper.hasUTF8BrokenBytes(bytes : TBytes) : boolean;
var
  len, pos : integer;
  b : byte;
begin
  len := system.length(bytes);
  pos := 0;
  while pos < len do
  begin
    b := bytes[pos];
    if      (b and %11111000) = %11110000 then
    begin
      // 4 bytes
      raise exception.create('Characters $10000+ unsupported');
    end
    else if (b and %11110000) = %11100000 then
    begin
      // 3 bytes
      if pos + 3 >= len then
        exit(true);
      pos += 3;
    end
    else if (b and %11100000) = %11000000 then
    begin
      // 2 bytes
      if pos + 2 >= len then
        exit(true);
      pos += 2;
    end
    else if (b and %10000000) = %00000000 then
    begin
      pos += 1;
    end;
  end;
  result := false;
end;
 
{
  hasUTF16BrokenBytes : returns true if there is a broken codepoint at the end
  of the byte array.
}
function TUnicodeStringHelper.hasUTF16BrokenBytes(bytes : TBytes) : boolean; inline;
begin
  result := ((system.length(bytes) and $1) <> 0);
end;
 
{
  hasCPBrokenBytes : always returns false.
}
function TUnicodeStringHelper.hasCPBrokenBytes(bytes : TBytes) : boolean; inline;
begin
  result := false;
end;
 
{
  mapCP : will convert a codepaged unicodestring to true unicode using an
  array [0..255] of WideChars. if a character is outside the 0-255 range, it
  will be mapped to null.
}
procedure TUnicodeStringHelper.mapCP(map : TWideChars);
var
  len, i : integer;
  pwc : PWideChar;
  cpchr : integer;
begin
  if system.length(map) <> 256 then
    raise exception.create('Invalid mapping table length. Needs 256 characters.');
 
  len := self.length;
  pwc := getmemory(len * sizeof(WideChar));
  move(self[1], pwc, len * sizeof(WideChar));
  self := '';
  for i := 0 to len - 1 do
  begin
    cpchr := pwc[i].toCharCode;
    if cpchr > 255 then
      cpchr := 0; // set to null if out of range.
    self += map[cpchr];
  end;
  freememory(pwc);
end;
 
end.
 
Lazarus

Bookstore

Search

Recent

Author Topic: WideChar / UnicodeString type helper (Read 2183 times)

codewar65

WideChar / UnicodeString type helper

	Computer Math and Games in Pascal (preview)
	Lazarus Handbook