Recent

Author Topic: Converting non-ASCII characters to closest ASCII [in Linux]  (Read 6299 times)

fedkad

  • Full Member
  • ***
  • Posts: 178
Re: Converting non-ASCII characters to closest ASCII [in Linux]
« Reply #15 on: June 23, 2019, 03:04:49 pm »
I tested with the function iconvert from unit iconvenc. Although it is 5-6 times slower than the "USASCIIString = type AnsiString(20127);" solution (in Windows), it seems to be an acceptable alternative for Linux. The code I will be using is something like:

Code: Pascal  [Select][+][-]
  1. uses
  2.   iconvenc;
  3. (* ... *)
  4. procedure TForm1.Button1Click(Sender: TObject);
  5. var
  6.   s2 : String = '';
  7. begin
  8.   iconvert(memo1.text,s2,'UTF-8','ASCII//TRANSLIT');
  9.   memo2.text := s2;
  10. end;

Unfortunately, some characters are converted differently. For example, 'Æ' is converted to two characters 'AE', and '£' to 'GBP'. I need a method to convert one character to one character. For example, 'Æ' to 'A', and '£' to 'L' or even '?'. In other words, the number of characters in the result string should be the same as in the source string.

Note: I think that the lower-level functions iconv_open, iconv, and iconv_close may perform better.
« Last Edit: June 23, 2019, 04:25:05 pm by fedkad »
Lazarus 4.0 / FPC 3.2.2 on x86_64-linux-gtk2 (Ubuntu/GNOME) and x86_64-win64-win32/win64 (Windows 11)

lucamar

  • Hero Member
  • *****
  • Posts: 4217
Re: Converting non-ASCII characters to closest ASCII [in Linux]
« Reply #16 on: June 23, 2019, 05:56:15 pm »
Unfortunately, some characters are converted differently. For example, 'Æ' is converted to two characters 'AE', and '£' to 'GBP'. [...]

Note that those are actually the correct conversions, just as the textual representation of € is "EUR". If that's not what you need, it shouldn't be too difficult to convert those (few) instances manually (e.g. with StringReplace()) before calling iconvert.
Turbo Pascal 3 CP/M - Amstrad PCW 8256 (512 KB !!!) :P
Lazarus/FPC 2.0.8/3.0.4 & 2.0.12/3.2.0 - 32/64 bits on:
(K|L|X)Ubuntu 12..18, Windows XP, 7, 10 and various DOSes.

engkin

  • Hero Member
  • *****
  • Posts: 3112
Re: Converting non-ASCII characters to closest ASCII [in Linux]
« Reply #17 on: June 28, 2019, 10:42:28 am »
The LConvEncoding unit has a nice function, UTF8ToSingleByte. It takes as a parameter a function that converts a Unicode code point to the target code page (here CP20127). I managed to automate writing such a function based on Windows conversion results. The outcome is a simple UTF8ToUSASCII:
Code: Pascal  [Select][+][-]
  1. unit cp_usascii;
  2.  
  3. {$mode objfpc}{$H+}
  4.  
  5. interface
  6.  
  7. type
  8.   USASCIIString = type AnsiString(20127);
  9.  
  10. function UTF8ToUSASCII(AStr: String): USASCIIString;
  11.  
  12. implementation
  13.  
  14. uses
  15.   LConvEncoding;
  16.  
  17. function UnicodeToCP20127(Unicode: Cardinal): integer;
  18. begin
  19.   case Unicode of
  20.     $0000..$007F: Result := Unicode - 0;
  21.     $00C1..$00C6: Result := 65;
  22.     $00C7..$00C8: Result := Unicode - 134;
  23.     $00C9..$00CB: Result := 69;
  24.     $00CD..$00CF: Result := 73;
  25.     $00D3..$00D6: Result := 79;
  26.     $00DA..$00DC: Result := 85;
  27.     $00E1..$00E6: Result := 97;
  28.     $00E7..$00E8: Result := Unicode - 134;
  29.     $00E9..$00EB: Result := 101;
  30.     $00ED..$00EF: Result := 105;
  31.     $00F3..$00F6: Result := 111;
  32.     $00FA..$00FC: Result := 117;
  33.     $2001..$2006: Result := 32;
  34.     $2013..$2014: Result := 45;
  35.     $201D..$201E: Result := 34;
  36.     $FF02..$FF1E: Result := Unicode - 65249;
  37.     $FF21..$FF5E: Result := Unicode - 65249;
  38.     $00A0,$2000: Result := 32;
  39.     $00A1,$FF01: Result := 33;
  40.     $02BA,$030E,$201C: Result := 34;
  41.     $00A4: Result := 36;
  42.     $02B9,$02BC,$02C8,$2018,$2019,$2032: Result := 39;
  43.     $00B8,$201A: Result := 44;
  44.     $00AD,$2010,$2011: Result := 45;
  45.     $00B7,$2022,$2026: Result := 46;
  46.     $00B9: Result := 49;
  47.     $00B2: Result := 50;
  48.     $00B3: Result := 51;
  49.     $00AB,$2039: Result := 60;
  50.     $00BB,$203A: Result := 62;
  51.     $10000: Result := 63;
  52.     $FF20: Result := 64;
  53.     $00C0,$0100,$0102,$0104,$01CD,$01DE: Result := 65;
  54.     $00A9,$0106,$0108,$010A,$010C: Result := 67;
  55.     $00D0,$010E,$0110,$0189: Result := 68;
  56.     $0112,$0114,$0116,$0118,$011A: Result := 69;
  57.     $0191: Result := 70;
  58.     $011C,$011E,$0120,$0122,$01E4,$01E6: Result := 71;
  59.     $0124,$0126: Result := 72;
  60.     $00CC,$0128,$012A,$012C,$012E,$0130,$0197,$01CF: Result := 73;
  61.     $0134: Result := 74;
  62.     $0136,$01E8: Result := 75;
  63.     $0139,$013B,$013D,$0141: Result := 76;
  64.     $00D1,$0143,$0145,$0147: Result := 78;
  65.     $00D2,$00D8,$014C,$014E,$0150,$0152,$019F,$01A0,$01D1,$01EA,$01EC: Result := 79;
  66.     $00AE,$0154,$0156,$0158: Result := 82;
  67.     $015A,$015C,$015E,$0160: Result := 83;
  68.     $0162,$0164,$0166,$01AE,$2122: Result := 84;
  69.     $00D9,$0168,$016A,$016C,$016E,$0170,$0172,$01AF,$01D3,$01D5,$01D7,$01D9,$01DB: Result := 85;
  70.     $0174: Result := 87;
  71.     $00A5,$00DD,$0176,$0178: Result := 89;
  72.     $0179,$017B,$017D: Result := 90;
  73.     $02C4,$02C6,$0302: Result := 94;
  74.     $02CD,$0331,$0332: Result := 95;
  75.     $02CB,$0300,$2035: Result := 96;
  76.     $00AA,$00E0,$0101,$0103,$0105,$01CE,$01DF: Result := 97;
  77.     $0180: Result := 98;
  78.     $00A2,$0107,$0109,$010B,$010D: Result := 99;
  79.     $010F,$0111: Result := 100;
  80.     $0113,$0115,$0117,$0119,$011B: Result := 101;
  81.     $0192: Result := 102;
  82.     $011D,$011F,$0121,$0123,$01E5,$01E7,$0261: Result := 103;
  83.     $0125,$0127: Result := 104;
  84.     $00EC,$0129,$012B,$012D,$012F,$0131,$01D0: Result := 105;
  85.     $0135,$01F0: Result := 106;
  86.     $0137,$01E9: Result := 107;
  87.     $013A,$013C,$013E,$0142,$019A: Result := 108;
  88.     $00F1,$0144,$0146,$0148: Result := 110;
  89.     $00BA,$00F2,$00F8,$014D,$014F,$0151,$0153,$01A1,$01D2,$01EB,$01ED: Result := 111;
  90.     $0155,$0157,$0159: Result := 114;
  91.     $015B,$015D,$015F,$0161: Result := 115;
  92.     $0163,$0165,$0167,$01AB: Result := 116;
  93.     $00F9,$0169,$016B,$016D,$016F,$0171,$0173,$01B0,$01D4,$01D6,$01D8,$01DA,$01DC: Result := 117;
  94.     $0175: Result := 119;
  95.     $00FD,$00FF,$0177: Result := 121;
  96.     $017A,$017C,$017E,$01B6: Result := 122;
  97.     $00A6: Result := 124;
  98.     $02DC,$0303: Result := 126;
  99.   else
  100.     Result := $3F;
  101.   end;
  102. end;
  103.  
  104. function UTF8ToUSASCII(AStr: String): USASCIIString;
  105. begin
  106.   Result := UTF8ToSingleByte(AStr, @UnicodeToCP20127);
  107. end;
  108.  
  109. end.

It is probably possible to use the charset unit from the RTL instead of LConvEncoding if needed.

Anyway, if you decide to use UTF8ToUSASCII, make sure to test it first. The few simple tests I did seemed to pass.

fedkad

  • Full Member
  • ***
  • Posts: 178
Re: Converting non-ASCII characters to closest ASCII [in Linux]
« Reply #18 on: June 30, 2019, 01:55:07 pm »
Thank you very much engkin for your effort. I think I will go with your solution.

Please note that your code has errors for some code points, for example:

Code: Pascal  [Select][+][-]
  1.  $00C7..$00C8: Result := Unicode - 134;
and
Code: Pascal  [Select][+][-]
  1.  $00E7..$00E8: Result := Unicode - 134;

I will check and modify it according to my needs!
Lazarus 4.0 / FPC 3.2.2 on x86_64-linux-gtk2 (Ubuntu/GNOME) and x86_64-win64-win32/win64 (Windows 11)

engkin

  • Hero Member
  • *****
  • Posts: 3112
Re: Converting non-ASCII characters to closest ASCII [in Linux]
« Reply #19 on: June 30, 2019, 06:19:26 pm »
Yes, you are right. There was a bug that caused these errors. The code after fixing the bug is:
Code: Pascal  [Select][+][-]
  1. unit cp_usascii;
  2.  
  3. {$mode objfpc}{$H+}
  4.  
  5. interface
  6.  
  7. type
  8.   USASCIIString = type AnsiString(20127);
  9.  
  10. function UTF8ToUSASCII(AStr: String): USASCIIString;
  11.  
  12. implementation
  13.  
  14. uses
  15.   LConvEncoding;
  16.  
  17. function UnicodeToCP20127(Unicode: Cardinal): integer;
  18. begin
  19.   case Unicode of
  20.     $0000..$007F: Result := Unicode - 0;
  21.     $00C1..$00C6: Result := 65;
  22.     $00C9..$00CB: Result := 69;
  23.     $00CD..$00CF: Result := 73;
  24.     $00D3..$00D6: Result := 79;
  25.     $00DA..$00DC: Result := 85;
  26.     $00E1..$00E6: Result := 97;
  27.     $00E9..$00EB: Result := 101;
  28.     $00ED..$00EF: Result := 105;
  29.     $00F3..$00F6: Result := 111;
  30.     $00FA..$00FC: Result := 117;
  31.     $2001..$2006: Result := 32;
  32.     $2013..$2014: Result := 45;
  33.     $201D..$201E: Result := 34;
  34.     $FF02..$FF1E: Result := Unicode - 65248;
  35.     $FF21..$FF5E: Result := Unicode - 65248;
  36.     $00A0,$2000: Result := 32;
  37.     $00A1,$FF01: Result := 33;
  38.     $02BA,$030E,$201C: Result := 34;
  39.     $00A4: Result := 36;
  40.     $02B9,$02BC,$02C8,$2018,$2019,$2032: Result := 39;
  41.     $00B8,$201A: Result := 44;
  42.     $00AD,$2010,$2011: Result := 45;
  43.     $00B7,$2022,$2026: Result := 46;
  44.     $00B9: Result := 49;
  45.     $00B2: Result := 50;
  46.     $00B3: Result := 51;
  47.     $00AB,$2039: Result := 60;
  48.     $00BB,$203A: Result := 62;
  49.     $10000: Result := 63;
  50.     $FF20: Result := 64;
  51.     $00C0,$0100,$0102,$0104,$01CD,$01DE: Result := 65;
  52.     $00A9,$00C7,$0106,$0108,$010A,$010C: Result := 67;
  53.     $00D0,$010E,$0110,$0189: Result := 68;
  54.     $00C8,$0112,$0114,$0116,$0118,$011A: Result := 69;
  55.     $0191: Result := 70;
  56.     $011C,$011E,$0120,$0122,$01E4,$01E6: Result := 71;
  57.     $0124,$0126: Result := 72;
  58.     $00CC,$0128,$012A,$012C,$012E,$0130,$0197,$01CF: Result := 73;
  59.     $0134: Result := 74;
  60.     $0136,$01E8: Result := 75;
  61.     $0139,$013B,$013D,$0141: Result := 76;
  62.     $00D1,$0143,$0145,$0147: Result := 78;
  63.     $00D2,$00D8,$014C,$014E,$0150,$0152,$019F,$01A0,$01D1,$01EA,$01EC: Result := 79;
  64.     $00AE,$0154,$0156,$0158: Result := 82;
  65.     $015A,$015C,$015E,$0160: Result := 83;
  66.     $0162,$0164,$0166,$01AE,$2122: Result := 84;
  67.     $00D9,$0168,$016A,$016C,$016E,$0170,$0172,$01AF,$01D3,$01D5,$01D7,$01D9,$01DB: Result := 85;
  68.     $0174: Result := 87;
  69.     $00A5,$00DD,$0176,$0178: Result := 89;
  70.     $0179,$017B,$017D: Result := 90;
  71.     $02C4,$02C6,$0302: Result := 94;
  72.     $02CD,$0331,$0332: Result := 95;
  73.     $02CB,$0300,$2035: Result := 96;
  74.     $00AA,$00E0,$0101,$0103,$0105,$01CE,$01DF: Result := 97;
  75.     $0180: Result := 98;
  76.     $00A2,$00E7,$0107,$0109,$010B,$010D: Result := 99;
  77.     $010F,$0111: Result := 100;
  78.     $00E8,$0113,$0115,$0117,$0119,$011B: Result := 101;
  79.     $0192: Result := 102;
  80.     $011D,$011F,$0121,$0123,$01E5,$01E7,$0261: Result := 103;
  81.     $0125,$0127: Result := 104;
  82.     $00EC,$0129,$012B,$012D,$012F,$0131,$01D0: Result := 105;
  83.     $0135,$01F0: Result := 106;
  84.     $0137,$01E9: Result := 107;
  85.     $013A,$013C,$013E,$0142,$019A: Result := 108;
  86.     $00F1,$0144,$0146,$0148: Result := 110;
  87.     $00BA,$00F2,$00F8,$014D,$014F,$0151,$0153,$01A1,$01D2,$01EB,$01ED: Result := 111;
  88.     $0155,$0157,$0159: Result := 114;
  89.     $015B,$015D,$015F,$0161: Result := 115;
  90.     $0163,$0165,$0167,$01AB: Result := 116;
  91.     $00F9,$0169,$016B,$016D,$016F,$0171,$0173,$01B0,$01D4,$01D6,$01D8,$01DA,$01DC: Result := 117;
  92.     $0175: Result := 119;
  93.     $00FD,$00FF,$0177: Result := 121;
  94.     $017A,$017C,$017E,$01B6: Result := 122;
  95.     $00A6: Result := 124;
  96.     $02DC,$0303: Result := 126;
  97.   else
  98.     Result := $3F;
  99.   end;
  100. end;
  101.  
  102. function UTF8ToUSASCII(AStr: String): USASCIIString;
  103. begin
  104.   Result := UTF8ToSingleByte(AStr, @UnicodeToCP20127);
  105. end;
  106.  
  107. end.

I also discovered that Windows translates Unicode code points above U+FFFF into two question marks "??" instead of one. My code does not. It should be easy to replicate the Windows behavior if you really want.

 

TinyPortal © 2005-2018