我有以下C#控制台应用程序代码,它使用HTMLAgilityPack来分析一些示例HTML:
static void Main(string[] args)
{
string input = @"
<span style=""font-style: italic"">This is the title</span>.
This is the introductory text:
<ol>
<li>List Item One</li>
<li>List Item Two</li>
<li>List Item Three</li>
<li>This list item is nested:
<ol>
<li>List Item Four A.</li>
<li>List Item Four B.</li>
</ol>
Yes it is.
</li>
<li>List Item Five</li>
</ol>
This is the footer text. Last updated: July 20, 2014
";
HtmlDocument doc = new HtmlDocument();
try
{
doc.LoadHtml(input);
}
catch (Exception e)
{
LogIt("ERROR: " + e.Message);
return;
}
HtmlNode get_title = doc.DocumentNode.SelectSingleNode("//span");
if (get_title != null)
{
LogIt("Title: '" + get_title.InnerHtml + "'");
}
HtmlNodeCollection get_outer_lists = doc.DocumentNode.SelectNodes("//ol//li");
if (get_outer_lists != null)
{
foreach (HtmlNode hn_outer in get_outer_lists)
{
LogIt("Begin outer for");
LogIt("outer HTML: '" + hn_outer.OuterHtml + "'");
// Now fetch inner list, the text above the inner list, and the
// text below the inner list.
HtmlNodeCollection get_inner_lists = doc.DocumentNode.SelectNodes("//ol//li//ol//li");
if (get_inner_lists != null)
{
foreach (HtmlNode hn_inner in get_inner_lists)
{
LogIt("\tinner HTML: '" + hn_inner.OuterHtml + "'");
}
}
else
{
LogIt("ERROR: Could not get inner list");
}
}
}
else
{
LogIt("ERROR: Could not select //ol//li");
Console.Read();
return;
}
Console.Read();
return;
}
private static void LogIt(string str)
{
Console.WriteLine(str);
return;
}
......这是输出:
Title: 'This is the title'
Begin outer for
outer HTML: '<li>List Item One</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Two</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Three</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>This list item is nested:
<ol>
<li>List Item Four A.</li>
<li>List Item Four B.</li>
</ol>
Yes it is.
</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Four B.</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Five</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
答案 0 :(得分:0)
关于问题1,您可以使用text()
选择文本节点(HTML示例中的介绍性文本和页脚):
var result = doc.DocumentNode.SelectNodes("/text()");
foreach (HtmlNode r in result)
{
LogIt(r.InnerText);
}
更新:
关于问题2(如果我理解正确的话),对于外部循环XPath,你可以得到<ol>
个节点有另一个<ol>
节点后代,然后得到直接子{{1}来自前面提到的<li>
的节点,如下所示:
<ol>