program project2;
{$mode objfpc}{$H+}
uses
classes, sysutils, fphttpclient, fasthtmlparser;
type
THtmlTextExtractor = class
private
FTempStream: TStream;
FIgnore: Boolean;
function CleanWhiteSpace(AText: String): String;
function FixHtmlEntities(AText: String): String;
procedure TagFoundHandler(NoCaseTag, ActualTag: string);
procedure TextFoundHandler(AText: String);
public
function ExtractFromHtml(AHtml: String): String;
end;
function THtmlTextExtractor.CleanWhiteSpace(AText: String): String;
begin
if (AText <> '') and (AText[1] = #10) then
while (AText <> '') and (AText[1] in [#10, ' ', #9]) do Delete(AText, 1, 1);
Result := AText;
end;
function THtmlTextExtractor.FixHtmlEntities(AText: String): String;
var
P, PEnd: PChar;
s: String;
begin
Result := '';
P := @AText[1];
PEnd := P + Length(AText);
while P < PEnd do begin
if P^ = '&' then
begin
s := '';
inc(P);
while (P <= PEnd) and (P^ <> ';') do begin
s := s + P^;
inc(P);
end;
case s of
'auml' : Result := Result + 'ä';
'Auml' : Result := Result + 'Ä';
'uuml' : Result := Result + 'ü';
'Uuml' : Result := Result + 'Ü';
'ouml' : Result := Result + 'ö';
'Ouml' : Result := Result + 'Ö';
'szlig': Result := Result + 'ß';
'nbsp' : Result := Result + ' ';
'lt' : Result := Result + '<';
'gt' : Result := Result + '>';
'amp' : Result := Result + '&';
// ... add more...
end;
end else
Result := Result + P^;
inc(P);
end;
s := Result;
end;
procedure THtmlTextExtractor.TagFoundHandler(NoCaseTag, ActualTag: string);
begin
// Use the FIgnore flag to skip some tags not needed
if (Pos('<HTML', NoCasetag) = 1) or
(NoCaseTag = '</SCRIPT>') or
(NoCaseTag = '</BUTTON>')
then
FIgnore := false
else
if (Pos('<SCRIPT', NoCaseTag) = 1) or
(Pos('<BUTTON', NoCaseTag) = 1) or
(NoCaseTag = '</HTML>')
then
FIgnore := true;
if FIgnore then
exit;
// Write a line-break after these tags
if (NoCasetag = '<BR>') or (NoCaseTag = '<BR />') or (NoCaseTag = '<BR/>') or
(NoCaseTag = '</P>') or (NoCaseTag = '</DIV>') or (NoCaseTag = '</TR>')
then
FTempStream.Write(LineEnding[1], Sizeof(LineEnding));
end;
procedure THtmlTextExtractor.TextFoundHandler(AText: String);
var
s: String;
begin
if FIgnore then
exit;
s := CleanWhiteSpace(AText);
if s = '' then
exit;
s := FixHtmlEntities(s);
FTempStream.Write(s[1], Length(s));
end;
function THtmlTextExtractor.ExtractFromHtml(AHtml: String): String;
var
parser: THtmlParser;
begin
if AHtml = '' then
exit ('');
parser := THtmlParser.Create(AHtml);
FTempStream := TMemoryStream.Create;
try
parser.OnFoundTag := @TagFoundHandler;
parser.OnFoundText := @TextFoundHandler;
parser.Exec;
FTempStream.Position := 0;
SetLength(Result, FTempStream.Size);
FTempStream.Read(Result[1], FTempStream.Size);
finally
FTempStream.Free;
parser.Free;
end;
end;
procedure SaveStringToFile(AText, AFileName: String);
var
F: TextFile;
begin
AssignFile(F, AFileName);
Rewrite(F);
WriteLn(F, AText);
CloseFile(F);
end;
var
s: String;
extractor: THtmlTextExtractor;
begin
s := TFPHTTPClient.SimpleGet('https://trainingslager.onlineliga.de/#url=/player/overview?playerId=28056');
if s <> '' then begin
SaveStringToFile(s, 'text.html');
extractor := THTMLTextExtractor.Create;
try
s := extractor.ExtractFromHtml(s);
SaveStringToFile(s, 'test.txt');
finally
extractor.Free;
end;
end;
end.