我正在尝试从 HTML 页面(具体是 class 为 commentList 的列表)中提取 URL 链接、标题和时间,并把它们放进一个列表里。我该如何实现?
这是html页面:
<div class="filter-Holder">
<div class="list">
<ul class="commentList">
<li>
<div class="time">19:45</div>
<h5>
<a href="https://urlLink.com" class="title">textTitle</a>
</h5>
</li>
</ul>
</div>
</div>
这是我到目前为止的代码:
// Asker's attempt: load a page over HTTP and try to read links from it.
// HtmlWeb is presumably HtmlAgilityPack's web loader — the using directives are not shown here.
var url = "https://www.url.com/";
var webGet = new HtmlWeb();
var document = webGet.Load(url);
// Take the first node matched by an absolute, position-based XPath and keep only its text content.
// NOTE(review): the XPath string literal below is split across two lines, which a regular C# string
// literal does not allow — presumably a paste artifact; confirm against the original post.
// NOTE(review): if SelectNodes returns null when nothing matches (as the answer's `??` fallback
// suggests), the [0] indexer would throw here — confirm against the HtmlAgilityPack docs.
string urllink = document.DocumentNode.SelectNodes("//*
[@id='content']/div[4]/div/div/div/ul")[0].InnerText;
// urllink is a string, so this loop visits it one character at a time and echoes each character.
foreach (var i in urllink)
{
Console.Write(i);
}
// Build a query over every descendant node that keeps <a> elements having an href attribute and
// non-blank inner text, projecting each one to an anonymous { Url, Text } object.
// Nothing in this snippet enumerates linksOnPage, so the query is only defined here, never consumed.
var linksOnPage = from
lnks in document.DocumentNode.Descendants()
where
lnks.Name == "a" &&
lnks.Attributes["href"] != null &&
lnks.InnerText.Trim().Length > 0
select new
{
Url = lnks.Attributes["href"].Value,
Text = lnks.InnerText,
};
答案 0 :(得分:0)
只要实际页面的 HTML 结构与您给出的片段基本一致,下面的代码应该可以解决问题。
/// <summary>
/// Demonstrates extracting the time, link URL, link text and link CSS class of every list item
/// inside each <c>ul</c> whose class is <c>commentList</c>, using an inline HTML sample.
/// One tab-separated line is written to the console per list item, then the program waits for Enter.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
private static void Main(string[] args)
{
    string htmlFragment = "" +
        "<div class=\"filter-Holder\">" +
        " <div class=\"list\">" +
        " <ul class=\"commentList\">" +
        " <li>" +
        " <div class=\"time\">19:45</div>" +
        " <h5>" +
        " <a href=\"https://urlLink.com\" class=\"title\">textTitle</a>" +
        " </h5>" +
        " </li>" +
        " </ul>" +
        " </div>" +
        "</div>";

    var doc = new HtmlDocument();
    doc.LoadHtml(htmlFragment);

    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so every call is paired with an empty-collection fallback.
    foreach (var unorderedList in doc.DocumentNode.SelectNodes("//ul[@class='commentList']") ?? new HtmlNodeCollection(doc.DocumentNode))
    {
        // The leading "." keeps the query relative to this <ul>; a bare "//li" is evaluated
        // from the document root and would return list items from the whole page.
        foreach (var listItem in unorderedList.SelectNodes(".//li") ?? new HtmlNodeCollection(unorderedList))
        {
            var time = string.Empty;
            var href = string.Empty;
            var cssClass = string.Empty;
            var text = string.Empty;

            // First <div class="time"> inside this list item (null when absent).
            var timeNode = listItem.SelectSingleNode(".//div[@class='time']");
            if (timeNode != null)
            {
                time = timeNode.InnerHtml;
            }

            // First anchor nested under an <h5> inside this list item (null when absent).
            var anchorNode = listItem.SelectSingleNode(".//h5/a");
            if (anchorNode != null)
            {
                href = ParseAnchorHref(anchorNode);
                cssClass = ParseAnchorClass(anchorNode);
                text = anchorNode.InnerText;
            }

            // Emit the extracted values so the result of the extraction is observable.
            Console.WriteLine("{0}\t{1}\t{2}\t{3}", time, href, text, cssClass);
        }
    }

    Console.ReadLine();
}
/// <summary>
/// Returns the value of the node's <c>href</c> attribute, or an empty string when the node has no such attribute.
/// </summary>
/// <param name="node">The node whose attributes are inspected.</param>
/// <returns>The <c>href</c> attribute value, or <see cref="string.Empty"/>.</returns>
private static string ParseAnchorHref(HtmlNode node)
{
    if (!node.Attributes.Contains("href"))
    {
        return string.Empty;
    }

    return node.Attributes["href"].Value;
}
/// <summary>
/// Returns the value of the node's <c>class</c> attribute, or an empty string when the node has no such attribute.
/// </summary>
/// <param name="node">The node whose attributes are inspected.</param>
/// <returns>The <c>class</c> attribute value, or <see cref="string.Empty"/>.</returns>
private static string ParseAnchorClass(HtmlNode node)
{
    if (!node.Attributes.Contains("class"))
    {
        return string.Empty;
    }

    return node.Attributes["class"].Value;
}