// From File
// Loads the HTML file, then for every <div class="crow"> row appends the href
// and inner text of each <a>/<span> found INSIDE that row to "{i}.txt".
// NOTE(review): `filePath` and `i` are not defined in this snippet; they are
// presumably supplied by the enclosing code — confirm.
var doc = new HtmlDocument();
doc.Load(filePath, Encoding.UTF8);

// By default SelectNodes returns null (not an empty collection) when nothing
// matches, so guard before iterating.
var rows = doc.DocumentNode.SelectNodes("//div[@class='crow']");
if (rows != null)
{
    foreach (HtmlNode node in rows)
    {
        Console.WriteLine(node.OuterHtml);

        // The leading '.' anchors the XPath at the current node. A bare "//a"
        // is an absolute path evaluated from the document root, so it matches
        // every <a>/<span> in the whole page no matter which node it is called on.
        var innerNodes = node.SelectNodes(".//a|.//span");
        if (innerNodes == null)
        {
            continue;
        }

        foreach (HtmlNode innerNode in innerNodes)
        {
            // <span> elements carry no href, so they fall back to "no Link".
            var link = innerNode.GetAttributeValue("href", "no Link");
            var textOrDate = innerNode.InnerText;

            // One append per element: link line followed by text/date line.
            File.AppendAllText($@"{i}.txt", link + "\r\n" + textOrDate + "\r\n");
        }
    }
}
当我检查 node.OuterHtml 时,doc.DocumentNode.SelectNodes(@"//div[@class='crow']") 能正确显示每页的 100 个结果;但是当我检查 innerNode.OuterHtml 时,它捕获到的锚标签根本不在该节点中,而是位于 HTML 页面的其他位置。为什么 //a|//span 没有被限制在该节点范围内的 a 和 span 标签上?
HTML 中的 crow 块看起来是这样的:
<div class="crow">
<div>something</div>
<div class="atitle">
<a href="link" target="_blank" title="title">[title]</a>
<a href="link" target="_blank" title="name">name</a>
</div>
<div class="sj">
<span>date</span>
</div>
</div>