我需要将HTML解析为XElement。我知道这种做法对格式错误的HTML不太宽容,但这没关系,因为我本来就想捕获无效的HTML。不过,我不希望XElement.Parse()方法一遇到HTML实体就失败。
我想知道.NET框架中是否内置了将命名HTML实体转换为数字字符引用的功能。
下面的代码可以正常工作,但我真的不想为每个实体都手动添加这样的映射。
''' <summary>
''' Builds the lookup table that maps named HTML character entities to the
''' equivalent numeric character references.
''' </summary>
''' <returns>
''' A new dictionary whose keys are named entities (for example "&amp;nbsp;")
''' and whose values are the matching decimal references (for example
''' "&amp;#160;"). A fresh instance is returned on every call.
''' </returns>
''' <remarks>
''' Only a hand-picked subset of the HTML 4 entity list is covered; see
''' http://www.w3.org/TR/html4/sgml/entities.html for the full table.
''' Entity names are case-sensitive: "&amp;ccedil;" and "&amp;Ccedil;" are
''' different characters, and only the spellings listed here are mapped.
''' </remarks>
Public Function GetEntityReplacementList() As IDictionary(Of String, String)
    Dim replacements As New Dictionary(Of String, String)

    ' Spacing and markup-significant characters.
    replacements.Add("&nbsp;", "&#160;")    ' non-breaking space
    replacements.Add("&lt;", "&#60;")       ' less than
    replacements.Add("&gt;", "&#62;")       ' greater than
    replacements.Add("&amp;", "&#38;")      ' ampersand

    ' Currency and legal symbols.
    replacements.Add("&cent;", "&#162;")    ' cent
    replacements.Add("&pound;", "&#163;")   ' pound
    replacements.Add("&yen;", "&#165;")     ' yen
    replacements.Add("&euro;", "&#8364;")   ' euro
    replacements.Add("&copy;", "&#169;")    ' copyright
    replacements.Add("&reg;", "&#174;")     ' registered trademark

    ' Typographic punctuation.
    replacements.Add("&lsquo;", "&#8216;")  ' left single quote
    replacements.Add("&rsquo;", "&#8217;")  ' right single quote
    replacements.Add("&ldquo;", "&#8220;")  ' left double quote
    replacements.Add("&rdquo;", "&#8221;")  ' right double quote
    replacements.Add("&bull;", "&#8226;")   ' bullet
    replacements.Add("&mdash;", "&#8212;")  ' em dash
    replacements.Add("&ndash;", "&#8211;")  ' en dash

    ' Accented lowercase letters. Each maps to its own lowercase code point;
    ' pointing them at an uppercase or unrelated letter would alter the text.
    replacements.Add("&ccedil;", "&#231;")  ' c with cedilla
    replacements.Add("&euml;", "&#235;")    ' e with diaeresis
    replacements.Add("&eacute;", "&#233;")  ' e with acute
    replacements.Add("&egrave;", "&#232;")  ' e with grave
    replacements.Add("&aacute;", "&#225;")  ' a with acute

    Return replacements
End Function
''' <summary>
''' Parses markup into an <see cref="XElement"/> after substituting every key
''' returned by <c>GetEntityReplacementList</c> with its mapped value.
''' </summary>
''' <param name="p_xml">The markup text to rewrite and parse.</param>
''' <returns>The element produced by <c>XElement.Parse</c> on the rewritten text.</returns>
''' <exception cref="System.ArgumentNullException">
''' <paramref name="p_xml"/> is Nothing.
''' </exception>
''' <remarks>
''' Keys are matched as literal, case-sensitive text rather than as regular
''' expression patterns. HTML entity names differ by case (for example
''' "&amp;eacute;" versus "&amp;Eacute;"), so a case-insensitive match would
''' silently substitute the wrong character. Anything left unreplaced is
''' handed to <c>XElement.Parse</c> as-is, which rejects markup that is not
''' well-formed XML.
''' </remarks>
<Extension()>
Public Function CreateXElementWithEntityReplacements(p_xml As String) As XElement
    ' Fail with the same exception type the regex-based version raised for a
    ' null input, instead of a NullReferenceException from String.Replace.
    If p_xml Is Nothing Then
        Throw New System.ArgumentNullException("p_xml")
    End If

    Dim rewritten As String = p_xml
    For Each entry In GetEntityReplacementList()
        ' String.Replace(String, String) is an ordinal, literal substitution.
        rewritten = rewritten.Replace(entry.Key, entry.Value)
    Next

    Return XElement.Parse(rewritten)
End Function