Web.HTTPApp.HTMLDecode错误,无效字符

时间:2017-09-19 11:00:58

标签: html delphi delphi-10.1-berlin

在Delphi 10.1.2应用程序中,Web.HTTPApp.HTMLDecode在遇到“无效的HTML编码字符”时会抛出异常:

// Demonstrates the failure: HTMLDecode is handed a string containing a bare
// '&' that is not part of an entity, and whatever exception it raises is
// reported through two message dialogs (class name first, then message).
try
  Web.HTTPApp.HTMLDecode('bad & good');
except
  on DecodeError: Exception do
  begin
    ShowMessage(Concat('Exception class name = ', DecodeError.ClassName));
    ShowMessage(Concat('Exception message = ', DecodeError.Message));
  end;
end;

如何让它只解码HTML字符实体,而把其他字符原样保留?

先说明一下,我已经尝试过下面这种写法:

// Workaround attempt: encode first so the bare '&' is turned into an entity
// before the string is handed to HTMLDecode.
Web.HTTPApp.HTMLDecode(Web.HTTPApp.HTMLEncode('bad & good'));

但这不适用于此字符串:

// Counter-example: the input already contains a real entity ('&amp;') next to
// a bare '&'. Encoding first escapes the existing entity a second time, so the
// decode step only undoes the extra layer and '&amp;' is left undecoded.
Web.HTTPApp.HTMLDecode(Web.HTTPApp.HTMLEncode('controls &amp; components, bad & good'));

1 个答案:

答案 0 :(得分:0)

IdStrings.StrHtmlDecode是解决方案:它会解码HTML字符实体,同时原样保留那些不属于任何HTML字符实体的独立&符号:

// Input mixes a real entity ('&amp;') with a bare '&' that must survive as-is.
IdStrings.StrHtmlDecode('controls &amp; components, bad & good');

返回结果为:

'controls & components, bad & good'

至少它没有抛出异常。

当然,HTML字符实体数不胜数。我添加了一些对我自己来说比较重要的实体:

// Decodes a fixed set of named HTML character entities, plus numeric
// character references (delegated to ReplaceNumericHtmlEntities), in AStr.
//
// The input is scanned once from left to right, and decoded output is never
// rescanned. This keeps '&amp;lt;' as the text '&lt;' instead of decoding it
// twice into '<', which a chain of whole-string replacements would do.
//
// Anything that is not a recognised entity - including a bare '&' - is copied
// to the result unchanged, so the function never raises on "invalid" input.
//
// Entity names are matched with exact case first (HTML entity names are case
// sensitive: '&Phi;' and '&phi;' are different letters). If there is no exact
// match, a case-insensitive match is accepted so that legacy spellings such
// as '&AMP;' or '&NBSP;' still decode.
//
// Parameters:
//   AStr - text that may contain HTML character entities.
// Returns:
//   AStr with every recognised entity replaced by the character it denotes.
function PAStrHtmlDecode (const AStr: string): string;
type
  TNamedEntity = record
    Name: string;   // entity name without the leading '&' and trailing ';'
    Value: string;  // text the entity decodes to
  end;
const
  // Longest token considered, '&' and ';' included. The longest supported
  // name is '&centerdot;' (11 characters); numeric references are shorter.
  MAX_ENTITY_LENGTH = 12;
  // Values are written as code points so the table does not depend on the
  // encoding the source file happens to be saved in.
  NAMED_ENTITIES: array[0..58] of TNamedEntity = (
    (Name: 'nbsp';      Value: #$00A0), // non-breaking space
    (Name: 'quot';      Value: '"'),    // double quotation mark
    (Name: 'gt';        Value: '>'),    // greater than
    (Name: 'lt';        Value: '<'),    // less than
    (Name: 'amp';       Value: '&'),    // ampersand
    (Name: 'reg';       Value: #$00AE), // registered trademark
    (Name: 'copy';      Value: #$00A9), // copyright
    (Name: 'trade';     Value: #$2122), // trademark
    (Name: 'euro';      Value: #$20AC), // euro
    (Name: 'yen';       Value: #$00A5), // yen
    (Name: 'pound';     Value: #$00A3), // pound
    (Name: 'cent';      Value: #$00A2), // cent
    (Name: 'dollar';    Value: '$'),    // dollar
    (Name: 'apos';      Value: ''''),   // single quotation mark (apostrophe)
    // Some mathematical symbols supported by HTML:
    (Name: 'forall';    Value: #$2200), // for all
    (Name: 'part';      Value: #$2202), // partial differential
    (Name: 'exist';     Value: #$2203), // there exists
    (Name: 'nabla';     Value: #$2207), // nabla
    (Name: 'isin';      Value: #$2208), // element of
    (Name: 'ni';        Value: #$220B), // contains as member
    (Name: 'prod';      Value: #$220F), // n-ary product
    (Name: 'sum';       Value: #$2211), // n-ary summation
    (Name: 'Phi';       Value: #$03A6), // Greek capital letter phi
    (Name: 'phi';       Value: #$03C6), // Greek small letter phi
    (Name: 'Pi';        Value: #$03A0), // Greek capital letter pi
    (Name: 'pi';        Value: #$03C0), // Greek small letter pi
    // Some other entities supported by HTML:
    (Name: 'larr';      Value: #$2190), // leftwards arrow
    (Name: 'rarr';      Value: #$2192), // rightwards arrow
    (Name: 'uarr';      Value: #$2191), // upwards arrow
    (Name: 'darr';      Value: #$2193), // downwards arrow
    (Name: 'excl';      Value: '!'),    // exclamation mark
    (Name: 'num';       Value: '#'),    // number sign
    (Name: 'percnt';    Value: '%'),    // percent sign
    (Name: 'plusmn';    Value: #$00B1), // plus-minus sign
    (Name: 'pm';        Value: #$00B1), // plus-minus sign
    (Name: 'sup2';      Value: #$00B2), // superscript two
    (Name: 'sup3';      Value: #$00B3), // superscript three
    (Name: 'micro';     Value: #$00B5), // micro sign
    (Name: 'para';      Value: #$00B6), // pilcrow sign
    (Name: 'middot';    Value: #$00B7), // middle dot
    (Name: 'centerdot'; Value: #$00B7), // middle dot
    (Name: 'frac14';    Value: #$00BC), // vulgar fraction one quarter
    (Name: 'frac12';    Value: #$00BD), // vulgar fraction one half
    (Name: 'half';      Value: #$00BD), // vulgar fraction one half
    (Name: 'frac34';    Value: #$00BE), // vulgar fraction three quarters
    (Name: 'times';     Value: #$00D7), // multiplication sign
    (Name: 'divide';    Value: #$00F7), // division sign
    (Name: 'div';       Value: #$00F7), // division sign
    (Name: 'szlig';     Value: #$00DF), // Latin small letter sharp s
    (Name: 'ldquo';     Value: #$201C), // left double quotation mark
    (Name: 'bdquo';     Value: #$201E), // double low-9 quotation mark
    (Name: 'rdquo';     Value: #$201D), // right double quotation mark
    (Name: 'bull';      Value: #$2022), // bullet
    (Name: 'hellip';    Value: #$2026), // horizontal ellipsis
    (Name: 'permil';    Value: #$2030), // per mille sign
    (Name: 'frac13';    Value: #$2153), // vulgar fraction one third
    (Name: 'phone';     Value: #$260E), // black telephone
    (Name: 'female';    Value: #$2640), // female sign
    (Name: 'male';      Value: #$2642)  // male sign
  );
var
  AmpPos: Integer;      // position of the '&' currently being examined
  SemiOffset: Integer;  // offset of the closing ';' relative to AmpPos
  CopyFrom: Integer;    // first character of AStr not yet copied to Result
  Candidate: string;
  Decoded: string;

  // Decodes one complete '&...;' token. Returns '' when the token is not a
  // recognised entity, in which case the caller keeps the original text.
  function DecodeToken(const AToken: string): string;
  var
    EntityName: string;
    I: Integer;
  begin
    Result := '';
    EntityName := Copy(AToken, 2, Length(AToken) - 2);
    if EntityName = '' then
      Exit;

    if Copy(EntityName, 1, 1) = '#' then
    begin
      // Numeric reference: an unchanged return value means "not decodable".
      Result := ReplaceNumericHtmlEntities(AToken);
      if Result = AToken then
        Result := '';
      Exit;
    end;

    // Exact-case match wins so that 'Phi'/'phi' and 'Pi'/'pi' stay distinct.
    for I := Low(NAMED_ENTITIES) to High(NAMED_ENTITIES) do
      if NAMED_ENTITIES[I].Name = EntityName then
        Exit(NAMED_ENTITIES[I].Value);

    // Lenient fallback for differently-cased spellings such as '&AMP;'.
    for I := Low(NAMED_ENTITIES) to High(NAMED_ENTITIES) do
      if SameText(NAMED_ENTITIES[I].Name, EntityName) then
        Exit(NAMED_ENTITIES[I].Value);
  end;

begin
  Result := '';
  CopyFrom := 1;
  AmpPos := Pos('&', AStr);
  while AmpPos > 0 do
  begin
    Decoded := '';
    // Only look a bounded distance ahead for the terminating ';' so that a
    // text full of bare ampersands does not cause repeated long scans.
    Candidate := Copy(AStr, AmpPos, MAX_ENTITY_LENGTH);
    SemiOffset := Pos(';', Candidate);
    if SemiOffset > 0 then
      Decoded := DecodeToken(Copy(Candidate, 1, SemiOffset));

    if Decoded <> '' then
    begin
      // Emit the untouched text before the entity, then the decoded value,
      // and resume scanning after the entity so the output is not rescanned.
      Result := Result + Copy(AStr, CopyFrom, AmpPos - CopyFrom) + Decoded;
      CopyFrom := AmpPos + SemiOffset;
      AmpPos := Pos('&', AStr, CopyFrom);
    end
    else
      // Not an entity: leave this '&' in place and try the next one.
      AmpPos := Pos('&', AStr, AmpPos + 1);
  end;
  Result := Result + Copy(AStr, CopyFrom, MaxInt);
end;

更新:我添加了一个解析数字HTML字符实体的函数。以下是示例代码:

uses
  //CodeSiteLogging,
  Vcl.Dialogs,

  System.Character,
  System.RegularExpressions,
  System.RegularExpressionsCore,
  System.SysUtils;

const
  // Sample input for ReplaceNumericHtmlEntities. It contains decimal numeric
  // character references only, including one above 65535 (&#120146;), which
  // the decoder routes through its ConvertFromUtf32 branch.
  TESTSUBJECT =
    'This is &#120146; text ' +
    'containing &#34;HTML Character Entities&#34; ' +
    'such as heart (&#9829;) or ' +
    'a diamond (&#9830;)';

// Replaces numeric HTML character references in AStr with the characters
// they denote. Both decimal ('&#9829;') and hexadecimal ('&#x2665;') forms
// are recognised.
//
// The input is processed in a single left-to-right pass and decoded output is
// never rescanned, so '&#38;#34;' becomes the text '&#34;' rather than being
// decoded a second time into a quotation mark.
//
// References that do not denote a usable character - code point 0, the
// UTF-16 surrogate range $D800..$DFFF, or anything above $10FFFF - are left
// in the text unchanged.
//
// Parameters:
//   AStr - text that may contain numeric character references.
// Returns:
//   AStr with every valid numeric character reference decoded.
function ReplaceNumericHtmlEntities(const AStr: string): string;
const
  // Explicit ASCII digit classes: only ASCII digits are valid in a reference.
  // Up to 7 decimal / 6 hex digits covers the whole range up to $10FFFF.
  NUMERIC_ENTITY_PATTERN = '&#(?:[xX][0-9A-Fa-f]{1,6}|[0-9]{1,7});';
var
  Found: TMatch;
  Decoded: string;
  CopyFrom: Integer;  // first character of AStr not yet copied to Result

  // Converts one matched reference ('&#NNN;' or '&#xHHH;') to its character.
  // Returns '' when the number is not a valid Unicode scalar value.
  function DecodeNumericEntity(const AEnt: string): string;
  var
    Digits: string;
    CodePoint: Integer;
  begin
    Result := '';
    // Strip the leading '&#' and the trailing ';'.
    Digits := Copy(AEnt, 3, Length(AEnt) - 3);
    // StrToIntDef parses hexadecimal when the text starts with '$'.
    if SameText(Copy(Digits, 1, 1), 'x') then
      Digits := '$' + Copy(Digits, 2, MaxInt);

    CodePoint := StrToIntDef(Digits, -1);
    if (CodePoint < 1) or (CodePoint > $10FFFF) then
      Exit;
    if (CodePoint >= $D800) and (CodePoint <= $DFFF) then
      Exit; // a lone surrogate is not a character

    if CodePoint > $FFFF then
      // Outside the Basic Multilingual Plane: needs a surrogate pair.
      Result := Char.ConvertFromUtf32(UCS4Char(CodePoint))
    else
      Result := Char(CodePoint);
  end;

begin
  Result := '';
  CopyFrom := 1;
  Found := TRegEx.Match(AStr, NUMERIC_ENTITY_PATTERN);
  while Found.Success do
  begin
    Decoded := DecodeNumericEntity(Found.Value);
    if Decoded = '' then
      Decoded := Found.Value; // undecodable: keep the original text

    // TMatch.Index is the 1-based position of the match within AStr.
    Result := Result + Copy(AStr, CopyFrom, Found.Index - CopyFrom) + Decoded;
    CopyFrom := Found.Index + Found.Length;
    Found := Found.NextMatch;
  end;
  // Append whatever follows the last reference (or all of AStr if none).
  Result := Result + Copy(AStr, CopyFrom, MaxInt);
end;

用法:

// Decode the sample text and show the result in a message dialog.
// (Alternative output via CodeSite logging, left disabled:)
//CodeSite.Send('ReplaceNumericHtmlEntities:', ReplaceNumericHtmlEntities(TESTSUBJECT));
ShowMessage(ReplaceNumericHtmlEntities(TESTSUBJECT));