我有一段包含 Unicode 符号的 HTML 标记:
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML xmlns:o = "urn:schemas-microsoft-com:office:office"><HEAD>
<META content="text/html; charset=windows-1251" http-equiv=Content-Type>
<META name=GENERATOR content="MSHTML 9.00.8112.16441"></HEAD>
<BODY>
<P>&psi;</P></BODY></HTML>
我使用 IHTMLTxtRange.pasteHTML 插入符号 &psi;。
当我读取 HTMLDocument2.body.innerHTML 时,我希望得到 <P>&psi;</P>(即保留实体写法的字符串表示),
但该属性返回的是一个 Unicode BSTR,
其中 &psi; 已被替换为实际字符 ψ,
即 Unicode 字符 $03C8。
答案 0 :(得分:0)
另一种解决方法
{ Returns the markup located between the opening and closing BODY tags of
  ADocument, taken from the bytes the document writes through
  IPersistStreamInit.Save (i.e. in the document's own byte encoding, not as
  a Unicode BSTR).

  ADocument - the document to read; nil yields an empty string.

  Result    - the bytes between the end of the opening BODY tag and the
              first closing BODY tag that follows it. An empty string is
              returned when the document is nil, when saving fails, when
              nothing was written, or when either tag cannot be found.

  The tag search is byte-wise and ASCII case-insensitive, and the opening
  tag may carry attributes. }
function GetInnerHTMLFromBody(const ADocument: IHTMLDOCUMENT2): AnsiString;
const
  // Typed as AnsiString so that Pos compares bytes and the indices it
  // returns can be used directly with Copy on the AnsiString buffer.
  bodyTagStart: AnsiString = '<BODY';
  closedBodyTag: AnsiString = '</BODY>';
var
  ms: TMemoryStream;
  html: AnsiString;
  upperHtml: AnsiString;
  i: Integer;
  startBody: Integer;
  stopBody: Integer;
begin
  Result := '';
  if ADocument = nil then
    Exit;

  html := '';
  ms := TMemoryStream.Create;
  try
    // Save reports failure through its HRESULT; act on it instead of
    // discarding the value returned by Succeeded.
    if not Succeeded((ADocument as IPersistStreamInit).Save(
      TStreamAdapter.Create(ms, soReference) as IStream, True)) then
      Exit;
    // Indexing html[1] on an empty string is invalid, so stop here when
    // nothing was written.
    if ms.Size = 0 then
      Exit;
    ms.Seek(0, soFromBeginning);
    SetLength(html, ms.Size);
    ms.ReadBuffer(html[1], ms.Size);
  finally
    ms.Free;
  end;

  // Upper-case only ASCII letters in a copy used for searching; the byte
  // count is unchanged, so indices found here are valid for html as well.
  upperHtml := html;
  for i := 1 to Length(upperHtml) do
    if upperHtml[i] in ['a'..'z'] then
      upperHtml[i] := AnsiChar(Ord(upperHtml[i]) - 32);

  startBody := Pos(bodyTagStart, upperHtml);
  if startBody = 0 then
    Exit;

  // Advance past the end of the opening tag so attributes are skipped.
  // NOTE(review): a '>' inside a quoted attribute value would end the tag
  // early; a real HTML parser would be needed to handle that case.
  while (startBody <= Length(upperHtml)) and (upperHtml[startBody] <> '>') do
    Inc(startBody);
  if startBody > Length(upperHtml) then
    Exit;
  Inc(startBody);

  // Look for the closing tag only after the opening tag; stopBody is a
  // 1-based index relative to startBody, so the content is stopBody - 1
  // bytes long.
  stopBody := Pos(closedBodyTag, Copy(upperHtml, startBody, MaxInt));
  if stopBody = 0 then
    Exit;

  Result := Copy(html, startBody, stopBody - 1);
end;
但是,此方法仅适用于 ANSI 编码的 HTML 文档。如果文档采用 Unicode 编码,则还需要额外进行从 Unicode 到 AnsiString 的转换:
// Tests whether the document's charset name equals 'unicode'; SameText makes
// the comparison case-insensitive. The charset value is passed through
// UTF8Encode and then Utf8ToAnsi before being compared.
// The branch body is elided in this snippet.
if SameText(Utf8ToAnsi(UTF8Encode(HTMLDocument2.charset)),'unicode') then
...