Hello,
Finally, it seems that the XPath unit of fcl-xml works well with HTML documents.
Here is a small project (in attachment) that uses XPath to search for some elements in HTML pages.
1 - Search for all the title attributes of recent_topics in home page of lazarus forum.
2 - Search for all the href attributes of the defined tree path in the pcurtis html table.
3 - Search for the td elements that contain the string "Page" in the pcurtis html table.
program parseHtmlTest;
// J.P October 2020
{$mode objfpc}{$H+}
uses classes, fphttpclient, opensslsockets, DOM, DOM_HTML, SAX_HTML, XPath;
// Program-wide state shared by the main block.
var
// Parsed HTML document; (re)assigned by each ReadHTMLFile call.
htmlDoc: THTMLDocument;
// Result object returned by EvaluateXPathExpression for the current query.
XPathRes: TXPathVariable;
// Text of the XPath expression about to be evaluated.
XPathExp: DomString;
// Node list obtained from XPathRes.AsNodeSet and handed to DisplayResult.
TheNodeSet : TNodeSet;
// Body of the page downloaded with SimpleGet.
s: String;
// Stream wrapping s so that it can be passed to ReadHTMLFile.
htmlStream: TStringStream;
{ Prints the text content of every node stored in NS, one node per line,
  and then a separator line. }
procedure DisplayResult( NS: TNodeSet);
var
  Idx: Integer;
  Current: TDOMNode;
begin
  for Idx := 0 to NS.Count - 1 do
  begin
    // The node set stores untyped pointers; each entry is treated as a DOM node.
    Current := TDOMNode(NS[Idx]);
    Writeln(Current.TextContent);
  end;
  Writeln('===================================');
end;
{$R *.res}
{ Main program: downloads the Lazarus forum home page, parses it, and runs one
  XPath query on it; then parses the local file 'table.html' and runs two more
  queries. Every result set is printed with DisplayResult.

  Ownership notes:
  - Each EvaluateXPathExpression call returns a new result object, so every
    result is freed as soon as its node set has been displayed.
  - The first document is freed before the second ReadHTMLFile call replaces
    the htmlDoc reference.
  - References are reset to nil after freeing so that the finally section can
    free whatever is still alive without risking a double free. }
begin
  try
    s := TFPCustomHTTPClient.SimpleGet('https://forum.lazarus.freepascal.org/index.php');
    htmlStream := TStringStream.Create(s);
    ReadHTMLFile(htmlDoc, htmlStream);
    // search for all the title attributes of recent_topics in home page of lazarus forum
    XPathExp := '//ul[@class="recent_topics"]/li/a/@title';
    XPathRes := EvaluateXPathExpression(XPathExp, htmlDoc.DocumentElement);
    TheNodeSet := XPathRes.AsNodeSet;
    DisplayResult(TheNodeSet);
    // done with the first result; TheNodeSet came from it, so drop that reference too
    TheNodeSet := nil;
    XPathRes.Free;
    XPathRes := nil;
    // release the forum document before htmlDoc is pointed at the next one
    htmlDoc.Free;
    htmlDoc := nil;
    // read input html file
    ReadHTMLFile(htmlDoc, 'table.html');
    // search for all the href attributes in the defined tree path
    XPathExp := '//table[@class="tborder"]/tr/td/a/@href';
    XPathRes := EvaluateXPathExpression(XPathExp, htmlDoc.DocumentElement);
    TheNodeSet := XPathRes.AsNodeSet;
    DisplayResult(TheNodeSet);
    TheNodeSet := nil;
    XPathRes.Free;
    XPathRes := nil;
    // Search for td which contains "Page" string
    XPathExp := '//td[contains(text(),"Page")]';
    XPathRes := EvaluateXPathExpression(XPathExp, htmlDoc.DocumentElement);
    TheNodeSet := XPathRes.AsNodeSet;
    DisplayResult(TheNodeSet);
    TheNodeSet := nil;
  finally
    // Free is a no-op on nil references, so this is safe on every exit path.
    XPathRes.Free;
    htmlDoc.Free;
    htmlStream.Free;
  end;
  // keep the console window open until the user presses Enter
  Readln;
end.
Tested with Lazarus 2.0.10 / FPC 3.2.0 on Windows 10 and CentOS 8.1.
Result in attachment.
Friendly, J.P