WideChar / UnicodeString type helper

Forum > Packages and Libraries

(1/1)

codewar65:
I had a specific need for this and thought I'd share. I've been working with various encoding types in the last few projects and needed to be able to convert bytes to UTF-8 / UTF-16 and handle 8-bit codepaged encoding/decoding. With all the existing units that handle this type of stuff, I was getting confused about which string type to use and which package / unit had the functions I needed.

I can now use UnicodeString / WideChar everywhere internally and convert to/from everything else on an as-needed basis using these helpers.

Please let me know if you notice any glaring mistakes.
Thanks / enjoy.

*modified for minor bug catches.

{ UnicodeHelper

  Type helpers for WideChar and UnicodeString that convert to and from
  UTF-8, UTF-16 (little endian) and single-byte codepage byte arrays.

  Limitations visible in this unit:
    - Only code units below $10000 are handled. A 4-byte UTF-8 lead byte
      raises an exception when decoding.
    - Surrogate code units are not paired up when encoding; each one is
      written as its own 3-byte sequence. }
unit UnicodeHelper;

{$mode objfpc}{$H+}{$modeswitch typehelpers}

interface

uses
  SysUtils;

type
  TUnicodeStringArray = array of UnicodeString;
  TWords = array of Word;
  TWideChars = array of WideChar;

  TWideCharHelper = type helper for WideChar
  public
    function getUTF8Length : integer;   // from 1 to 3 (4=not yet).
    function getUTF16Length : integer;  // always 2
    function getCPLength : integer;     // always 1
    function toCharCode : integer;
    function fromCharCode(chr : integer) : WideChar;
  end;

  TUnicodeStringHelper = type helper for UnicodeString
  public
    function length : integer; overload;
    function substring(index : Integer): unicodestring; overload;
    function substring(index : Integer; len : Integer): unicodestring; overload;
    function charCodeAt(index : integer) : integer;
    function charAt(index : integer) : WideChar;
    function split(const Separators: array of WideChar): TUnicodeStringArray; overload;
    // other String Helper type functions can be added as required.

    function toWideCharArray : TWideChars;
    function toWordArray : TWords;

    function toUTF8Bytes : TBytes;
    function toUTF16Bytes : TBytes;
    function toCPBytes : TBytes;

    function getUTF8BytesLength : integer;   // varies
    function getUTF16BytesLength : integer;  // length * 2
    function getCPBytesLength : integer;     // length

    function fromUTF8Bytes(bytes : TBytes) : UnicodeString;
    function fromUTF16Bytes(bytes : TBytes) : UnicodeString;
    function fromCPBytes(bytes : TBytes) : UnicodeString;

    function hasUTF8BrokenBytes(bytes : TBytes) : boolean;
    function hasUTF16BrokenBytes(bytes : TBytes) : boolean;
    function hasCPBrokenBytes(bytes : TBytes) : boolean;   // always false

    function getUTF8BrokenBytes(bytes : TBytes) : TBytes;
    function getUTF16BrokenBytes(bytes : TBytes) : TBytes;
    function getCPBrokenBytes(bytes : TBytes) : TBytes;    // always returns []

    procedure mapCP(map : TWideChars);
  end;

implementation

{ TWideCharHelper }

{ getUTF8Length : number of bytes needed to encode this code unit as UTF-8.
  A WideChar is 16 bits wide, so in practice the result is 1, 2 or 3; the
  final branch is kept only as a marker for future $10000+ support. }
function TWideCharHelper.getUTF8Length : integer;
begin
  if integer(self) < $80 then
    result := 1
  else if integer(self) < $800 then
    result := 2
  else if integer(self) < $10000 then
    result := 3
  else
    result := 4;
end;

{ getUTF16Length : number of bytes of one UTF-16 code unit (always 2). }
function TWideCharHelper.getUTF16Length : integer; inline;
begin
  result := 2;
end;

{ getCPLength : number of bytes of one codepage character (always 1). }
function TWideCharHelper.getCPLength : integer; inline;
begin
  result := 1;
end;

{ toCharCode : numeric value of this code unit. }
function TWideCharHelper.toCharCode : integer; inline;
begin
  result := integer(self);
end;

{ fromCharCode : returns the WideChar whose value is chr.
  The value of self is not used. }
function TWideCharHelper.fromCharCode(chr : integer) : WideChar; inline;
begin
  result := WideChar(chr);
end;

{ TUnicodeStringHelper }

{ length : length of the UnicodeString in WideChars. }
function TUnicodeStringHelper.length : integer; inline;
begin
  result := system.length(self);
end;

{ substring : returns everything from the 0-based index to the end.
  Returns '' when index is negative or past the last character. }
function TUnicodeStringHelper.substring(index : Integer): unicodestring;
var
  strlen : integer;
begin
  strlen := system.length(self);
  if (index < 0) or (index >= strlen) then
    result := ''
  else
    result := system.copy(self, index + 1, strlen - index);
end;

{ substring : returns up to len characters starting at the 0-based index.
  len is clipped to the end of the string. Returns '' when index is out of
  range or len is not positive. }
function TUnicodeStringHelper.substring(index : Integer; len : Integer): unicodestring;
var
  strlen : integer;
begin
  strlen := system.length(self);
  if (index < 0) or (index >= strlen) or (len <= 0) then
    result := ''
  else
  begin
    if index + len > strlen then
      len := strlen - index;
    result := system.copy(self, index + 1, len);
  end;
end;

{ charCodeAt : numeric value of the character at the 0-based index,
  or 0 when index is out of range. }
function TUnicodeStringHelper.charCodeAt(index : integer) : integer; inline;
begin
  if (index < 0) or (index >= system.length(self)) then
    result := 0
  else
    result := self[index + 1].toCharCode;
end;

{ charAt : character at the 0-based index, or WideChar(0) when index is
  out of range. }
function TUnicodeStringHelper.charAt(index : integer) : WideChar; inline;
begin
  if (index < 0) or (index >= system.length(self)) then
    result := WideChar(0)
  else
    result := self[index + 1];
end;

{ split : cuts the string at every occurrence of any of the Separators.
  Empty pieces are kept, so the result always has (separator count + 1)
  entries; an empty string yields a single empty entry. }
function TUnicodeStringHelper.split(const Separators: array of WideChar): TUnicodeStringArray;
var
  i, j, lastpos : integer;
  ch : widechar;
begin
  setlength(result, 0);
  lastpos := 0;
  for i := 0 to system.length(self) - 1 do
  begin
    ch := self.charAt(i);
    for j := 0 to system.length(Separators) - 1 do
    begin
      if ch = Separators[j] then
      begin
        setlength(result, system.length(result) + 1);
        result[system.length(result) - 1] := self.substring(lastpos, i - lastpos);
        lastpos := i + 1;
        break;
      end;
    end;
  end;
  // Whatever follows the last separator (possibly nothing) is the final piece.
  setlength(result, system.length(result) + 1);
  result[system.length(result) - 1] := self.substring(lastpos);
end;

{ toWideCharArray : copies every character of the string into a new array. }
function TUnicodeStringHelper.toWideCharArray : TWideChars;
var
  len : integer;
begin
  len := system.length(self);
  setlength(result, len);
  // Guard: self[1] / result[0] do not exist for an empty string.
  if len > 0 then
    move(self[1], result[0], len * sizeof(WideChar));
end;

{ toWordArray : copies every code unit of the string into a new Word array. }
function TUnicodeStringHelper.toWordArray : TWords;
var
  len : longint;
begin
  len := system.length(self);
  setlength(result, len);
  if len > 0 then
    move(self[1], result[0], sizeof(WideChar) * len);
end;

{ toUTF8Bytes : returns array of bytes encoded in UTF8.
  NOTE: surrogate code units are written as individual 3-byte sequences;
  they are not combined into 4-byte sequences. }
function TUnicodeStringHelper.toUTF8Bytes : TBytes;
var
  cv, i, o : integer;
  cw : WideChar;
begin
  setlength(Result, self.getUTF8BytesLength);
  o := 0; // next free slot in Result
  for i := 1 to system.length(self) do
  begin
    cw := self[i];
    cv := cw.toCharCode;
    case cw.getUTF8Length of
      1:
        begin
          Result[o] := byte(cv);
          inc(o);
        end;
      2:
        begin
          Result[o]     := byte(%11000000 or ((cv shr 6) and %00011111));
          Result[o + 1] := byte(%10000000 or (cv and %00111111));
          inc(o, 2);
        end;
      3:
        begin
          Result[o]     := byte(%11100000 or ((cv shr 12) and %00001111));
          Result[o + 1] := byte(%10000000 or ((cv shr 6) and %00111111));
          Result[o + 2] := byte(%10000000 or (cv and %00111111));
          inc(o, 3);
        end;
      4:
        raise exception.create('Characters $10000+ unsupported');
    end;
  end;
end;

{ toUTF16Bytes : returns array of bytes encoded in UTF16, in the in-memory
  byte order of the string. }
function TUnicodeStringHelper.toUTF16Bytes : TBytes;
var
  len : integer;
begin
  len := self.getUTF16BytesLength;
  setlength(Result, len);
  if len > 0 then
    move(self[1], Result[0], len);
end;

{ toCPBytes : returns array of bytes, one per character. If a character is
  beyond the 255 range, it is converted to NULL. }
function TUnicodeStringHelper.toCPBytes : TBytes;
var
  len, i, cv : integer;
begin
  // One byte per character: the codepage length, not the UTF-8 length.
  len := self.getCPBytesLength;
  setlength(Result, len);
  for i := 1 to len do
  begin
    cv := self[i].toCharCode;
    if cv > 255 then
      cv := 0;
    Result[i - 1] := byte(cv);
  end;
end;

{ getUTF8BytesLength : returns number of bytes required to encode as UTF8. }
function TUnicodeStringHelper.getUTF8BytesLength : integer;
var
  i : integer;
begin
  result := 0;
  for i := 1 to system.length(self) do
    inc(result, self[i].getUTF8Length);
end;

{ getUTF16BytesLength : returns number of bytes required to encode as UTF16. }
function TUnicodeStringHelper.getUTF16BytesLength : integer; inline;
begin
  result := system.length(self) shl 1;
end;

{ getCPBytesLength : returns number of bytes required to encode as codepage.
  Does not consider any characters beyond the 255 charcode value. }
function TUnicodeStringHelper.getCPBytesLength : integer; inline;
begin
  result := system.length(self);
end;

{ fromUTF8Bytes : returns unicodestring of UTF8 in bytes. The value of self
  is not used.
  Ignores broken bytes of partial codepoints on end; use hasUTF8BrokenBytes /
  getUTF8BrokenBytes to detect / retrieve the broken bytes to pump into next
  chunk from stream.
  A stray continuation byte or an invalid lead byte is decoded as U+FFFD and
  skipped. Raises an exception on a 4-byte lead byte. }
function TUnicodeStringHelper.fromUTF8Bytes(bytes : TBytes) : UnicodeString;
var
  len, pos : integer;
  val : UInt32;
  b : byte;
begin
  len := system.length(bytes);
  result := '';
  pos := 0;
  while pos < len do
  begin
    b := bytes[pos];
    if (b and %11111000) = %11110000 then
    begin
      // 4 bytes
      raise exception.create('Characters $10000+ unsupported');
    end
    else if (b and %11110000) = %11100000 then
    begin
      // 3 bytes
      if pos + 3 <= len then
      begin
        val := UInt32(bytes[pos + 2] and $3F)
            or (UInt32(bytes[pos + 1] and $3F) shl 6)
            or (UInt32(b and $0F) shl 12);
        result := result + WideChar(val);
      end; // else broken
      inc(pos, 3);
    end
    else if (b and %11100000) = %11000000 then
    begin
      // 2 bytes
      if pos + 2 <= len then
      begin
        val := UInt32(bytes[pos + 1] and $3F) or (UInt32(b and $1F) shl 6);
        result := result + WideChar(val);
      end; // else broken
      inc(pos, 2);
    end
    else if (b and %10000000) = %00000000 then
    begin
      // 1 byte
      result := result + WideChar(b);
      inc(pos);
    end
    else
    begin
      // Stray continuation byte (10xxxxxx) or invalid lead (11111xxx).
      // pos must still advance, otherwise the loop never terminates.
      result := result + WideChar($FFFD);
      inc(pos);
    end;
  end;
end;

{ fromUTF16Bytes : returns unicodestring of little endian UTF16 in bytes.
  The value of self is not used. A single trailing odd byte is ignored; use
  hasUTF16BrokenBytes / getUTF16BrokenBytes to detect / retrieve it to pump
  into next chunk from stream. }
function TUnicodeStringHelper.fromUTF16Bytes(bytes : TBytes) : UnicodeString;
var
  len, pos : integer;
begin
  len := system.length(bytes);
  result := '';
  pos := 0;
  while pos < len do
  begin
    if pos + 1 < len then
      result := result + widechar(bytes[pos] + (bytes[pos + 1] shl 8)); // little endian
    inc(pos, 2);
  end;
end;

{ fromCPBytes : returns unicodestring with one character per byte, each
  character taking the byte's value. The value of self is not used. }
function TUnicodeStringHelper.fromCPBytes(bytes : TBytes) : UnicodeString;
var
  len, i : integer;
begin
  len := system.length(bytes);
  setlength(result, len);
  for i := 0 to len - 1 do
    result[i + 1] := WideChar(bytes[i]);
end;

{ getUTF8BrokenBytes : returns left overs of a broken (incomplete) codepoint
  at the end of the byte array, or an empty array when there is none.
  Stray continuation bytes and invalid lead bytes are skipped.
  Raises an exception on a 4-byte lead byte. }
function TUnicodeStringHelper.getUTF8BrokenBytes(bytes : TBytes) : TBytes;
var
  len, pos, need : integer;
  b : byte;
begin
  len := system.length(bytes);
  pos := 0;
  while pos < len do
  begin
    b := bytes[pos];
    if (b and %11111000) = %11110000 then
      raise exception.create('Characters $10000+ unsupported')
    else if (b and %11110000) = %11100000 then
      need := 3
    else if (b and %11100000) = %11000000 then
      need := 2
    else
      need := 1; // ASCII, or an invalid byte that is simply stepped over

    if pos + need > len then
    begin
      // Sequence runs off the end of the buffer: hand back its bytes.
      setlength(result, len - pos);
      move(bytes[pos], result[0], len - pos);
      exit;
    end;
    inc(pos, need);
  end;
  setlength(result, 0);
end;

{ getUTF16BrokenBytes : returns the trailing odd byte of the byte array, or
  an empty array when the length is even. }
function TUnicodeStringHelper.getUTF16BrokenBytes(bytes : TBytes) : TBytes;
begin
  if self.hasUTF16BrokenBytes(bytes) then
  begin
    setlength(Result, 1);
    Result[0] := bytes[system.length(bytes) - 1];
  end
  else
    setlength(Result, 0);
end;

{ getCPBrokenBytes : always returns empty byte array. }
function TUnicodeStringHelper.getCPBrokenBytes(bytes : TBytes) : TBytes; inline;
begin
  setlength(Result, 0);
end;

{ hasUTF8BrokenBytes : returns true if there is a broken (incomplete)
  codepoint at the end of the byte array. A complete multi-byte sequence that
  ends exactly at the end of the array is NOT broken.
  Stray continuation bytes and invalid lead bytes are skipped.
  Raises an exception on a 4-byte lead byte. }
function TUnicodeStringHelper.hasUTF8BrokenBytes(bytes : TBytes) : boolean;
var
  len, pos, need : integer;
  b : byte;
begin
  len := system.length(bytes);
  pos := 0;
  while pos < len do
  begin
    b := bytes[pos];
    if (b and %11111000) = %11110000 then
      raise exception.create('Characters $10000+ unsupported')
    else if (b and %11110000) = %11100000 then
      need := 3
    else if (b and %11100000) = %11000000 then
      need := 2
    else
      need := 1; // ASCII, or an invalid byte; stepping over it keeps the loop finite

    // Strictly greater: pos + need = len means the sequence fits exactly.
    if pos + need > len then
      exit(true);
    inc(pos, need);
  end;
  result := false;
end;

{ hasUTF16BrokenBytes : returns true if the byte array has an odd length,
  i.e. half a code unit is left over at the end. }
function TUnicodeStringHelper.hasUTF16BrokenBytes(bytes : TBytes) : boolean; inline;
begin
  result := ((system.length(bytes) and $1) <> 0);
end;

{ hasCPBrokenBytes : always returns false. }
function TUnicodeStringHelper.hasCPBrokenBytes(bytes : TBytes) : boolean; inline;
begin
  result := false;
end;

{ mapCP : converts a codepaged unicodestring in place to true unicode using
  an array [0..255] of WideChars. A character outside the 0-255 range is
  treated as character 0 and therefore becomes map[0].
  Raises an exception when map does not hold exactly 256 entries. }
procedure TUnicodeStringHelper.mapCP(map : TWideChars);
var
  len, i, cpchr : integer;
  mapped : UnicodeString;
begin
  if system.length(map) <> 256 then
    raise exception.create('Invalid mapping table length. Needs 256 characters.');

  len := system.length(self);
  // Build the result in a managed string: no raw buffer to overrun or leak.
  setlength(mapped, len);
  for i := 1 to len do
  begin
    cpchr := ord(self[i]);
    if cpchr > 255 then
      cpchr := 0; // set to null if out of range.
    mapped[i] := map[cpchr];
  end;
  self := mapped;
end;

end.

Navigation

[0] Message Index