我使用以下方法从html中提取文本:
public string getAllText(string _html)
{
string _allText = "";
try
{
HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
document.LoadHtml(_html);
var root = document.DocumentNode;
var sb = new StringBuilder();
foreach (var node in root.DescendantNodesAndSelf())
{
if (!node.HasChildNodes)
{
string text = node.InnerText;
if (!string.IsNullOrEmpty(text))
sb.AppendLine(text.Trim());
}
}
_allText = sb.ToString();
}
catch (Exception)
{
}
_allText = System.Web.HttpUtility.HtmlDecode(_allText);
return _allText;
}
问题是我也得到了脚本和样式标签。
我怎么能排除它们?
答案 0 :(得分:50)
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
doc.DocumentNode.Descendants()
.Where(n => n.Name == "script" || n.Name == "style")
.ToList()
.ForEach(n => n.Remove());
答案 1 :(得分:6)
您可以使用HtmlDocument
类:
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(input);
doc.DocumentNode.SelectNodes("//style|//script").ToList().ForEach(n => n.Remove());
答案 2 :(得分:1)
一些优秀的答案,System.Linq很方便!
对于非基于Linq的方法:
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
// Get all Nodes: script
HtmlAgilityPack.HtmlNodeCollection Nodes = webDocument.DocumentNode.SelectNodes("//script");
// Make sure not Null:
if (Nodes == null)
return webDocument;
// Remove all Nodes:
foreach (HtmlNode node in Nodes)
node.Remove();
return webDocument;
}
答案 3 :(得分:0)
public static string StripStyles(this string html)
{
var document = new HtmlDocument();
document.LoadHtml(html);
foreach (var node in document.DocumentNode.DescendantsAndSelf())
{
var toRemove = node.Attributes.Where(x => x.Name == "style" || x.Name == "script")
.ToList();
foreach (var attribute in toRemove)
{
attribute.Remove();
}
}
return document.DocumentNode.OuterHtml;
}