program Detect4Enc;
{$mode objfpc}
uses
SysUtils, Classes;
type
TEnc = (encASCII, encUTF8, encCP850, encCP1252);
const
// German umlauts + ß
DEChars: UnicodeString =
'äöüÄÖÜß';
Garbage: UnicodeString =
'∆µ†¤';
BoxChars: UnicodeString =
'░▒▓│┤┼┐└═';
function LoadBytes(const FN: string): TBytes;
var
FS: TFileStream;
begin
FS := TFileStream.Create(FN, fmOpenRead or fmShareDenyWrite);
try
SetLength(Result, FS.Size);
FS.ReadBuffer(Result[0], FS.Size);
finally
FS.Free;
end;
end;
function IsASCII(const B: TBytes): Boolean;
var
i: Integer;
begin
Result := True;
for i := 0 to High(B) do
if B[i] > 127 then Exit(False);
end;
function IsRealUTF8(const B: TBytes): Boolean;
var
i, n: Integer;
HasMulti: Boolean;
begin
i := 0;
HasMulti := False;
while i < Length(B) do
begin
if B[i] < $80 then
Inc(i)
else
begin
HasMulti := True;
if (B[i] and $E0) = $C0 then n := 1
else if (B[i] and $F0) = $E0 then n := 2
else if (B[i] and $F8) = $F0 then n := 3
else Exit(False);
Inc(i);
while n > 0 do
begin
if (i >= Length(B)) or ((B[i] and $C0) <> $80) then Exit(False);
Inc(i);
Dec(n);
end;
end;
end;
Result := HasMulti;
end;
function Score(const S: UnicodeString): Integer;
var
i: Integer;
begin
Result := 0;
for i := 1 to Length(S) do
begin
if S[i] in ['A'..'Z','a'..'z'] then Inc(Result);
if Pos(S[i], DEChars) > 0 then Inc(Result, 6);
if Pos(S[i], Garbage) > 0 then Dec(Result, 20);
if Pos(S[i], BoxChars) > 0 then Dec(Result, 25);
end;
// German language patterns
if Pos(' der ', S) > 0 then Inc(Result, 20);
if Pos(' die ', S) > 0 then Inc(Result, 20);
if Pos(' und ', S) > 0 then Inc(Result, 15);
if Pos('sch', S) > 0 then Inc(Result, 10);
if Pos('über', S) > 0 then Inc(Result, 25);
if Pos('ß', S) > 0 then Inc(Result, 20);
end;
function DetectEncoding(const FN: string): TEnc;
var
B: TBytes;
S850, S1252: UnicodeString;
A, B2: Integer;
begin
B := LoadBytes(FN);
if IsASCII(B) then
Exit(encASCII);
if IsRealUTF8(B) then
Exit(encUTF8);
S850 := TEncoding.GetEncoding(850).GetString(B);
S1252 := TEncoding.GetEncoding(1252).GetString(B);
A := Score(S850);
B2 := Score(S1252);
if B2 > A then
Result := encCP1252
else
Result := encCP850;
end;
procedure Test(const FN: string);
begin
Write(FN, ' -> ');
case DetectEncoding(FN) of
encASCII: Writeln('ASCII');
encUTF8: Writeln('UTF8');
encCP850: Writeln('CP850');
encCP1252: Writeln('CP1252');
end;
end;
begin
Test('demo1.txt');
Test('demo2.txt');
Test('demo3.txt');
Test('demo4.txt');
ReadLn;
end.