Error in ReadHTMLFile

Summary

The ReadHTMLFile procedure (from the SAX_HTML unit) raises the exception 'EDOMError in DOMDocument.CreateElement', code 5 (INVALID_CHARACTER_ERR), when reading HTML text that contains a less-than sign inside a quoted attribute value.

System Information

  • Operating system: Windows 11
  • Processor architecture: x86-64
  • Compiler version: 3.3.1-14158-g6fda6f79 [2023/10/23] for x86_64
  • Device: Computer

Steps to reproduce

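Compile and run the example project below. The first ReadHTML call parses markup whose title attribute value contains a less-than sign and reproduces the exception; the second call parses the same markup with that less-than sign removed, for comparison.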

Example Project

program htmlproblem;
 
{$mode objfpc}{$H+}
 
uses SysUtils, Classes, DOM, DOM_HTML, SAX_HTML;
 
var
  sHTML: String =
        '<!DOCTYPE html><html lang="en"><head><title>Test</title></head><body>' +
        '<div title="test<"></div>' +
        '</body></html>';
  HTMLDocument: THTMLDocument;
  iPos: Integer;
 
{ Parses AHTML with ReadHTMLFile into the global HTMLDocument and reports the
  outcome on standard output.

  AHTML: the HTML markup to parse.

  Any exception raised while parsing is caught and its message is printed; for
  an EDOMError the numeric DOM error code is printed as well. Nothing is
  re-raised to the caller. The document and the input stream are released
  before the procedure returns. }
procedure ReadHTML(AHTML: String);
var
  Source: TStringStream;
begin
  WriteLn('Reading: ', AHTML);
  // Keep the stream in a local so it can be released afterwards; creating it
  // inline in the ReadHTMLFile call would leave nothing to free it with.
  Source := TStringStream.Create(AHTML);
  try
    try
      ReadHTMLFile(HTMLDocument, Source);
      WriteLn('Read OK.');
    except
      on E: Exception do
      begin
        WriteLn('Read not OK: ', E.Message);
        if E is EDOMError then
          WriteLn('EDOMError code: ', (E as EDOMError).Code);
      end;
    end;
    WriteLn;
  finally
    Source.Free;
    // HTMLDocument is a global that is reused by the next call: reset it to
    // nil after freeing so it never refers to a destroyed object.
    FreeAndNil(HTMLDocument);
  end;
end;
 
{ Reproduction driver: parse the markup twice, first as declared and then
  with the less-than sign taken out of the attribute value. }
begin
  // Pass 1: the attribute value still ends with '<'.
  ReadHTML(sHTML);

  // Locate the '<' that sits directly in front of the closing quote and drop
  // that single character; the text is left untouched if the marker is absent.
  iPos := Pos('<">', sHTML);
  if iPos > 0 then
    Delete(sHTML, iPos, 1);

  // Pass 2: same markup without the less-than sign.
  ReadHTML(sHTML);
end.