我正在尝试抓取一个网站来获取数据。到目前为止,程序至少已经能够连接到该网站,但当我尝试用抓取到的数据设置文本框的文本时,得到的只是一堆:
HtmlAgilityPack.HtmlNodeCollection
输出中 HtmlAgilityPack.HtmlNodeCollection 的条数与数据的条数相同。这是我的代码(我知道它有点乱):
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using System;
using HtmlAgilityPack;
namespace WindowsFormsApplication1
{
/// <summary>
/// Form that downloads the school-closings page on a button click and writes
/// one entry per matched &lt;p&gt; node into <c>textBox1</c>.
/// NOTE: this is the question's original listing; it still contains the bug
/// being asked about (see the comment inside the foreach loop).
/// </summary>
public partial class Form1 : Form
{
// String form of the item currently selected in listBox1
// (assigned in listBox1_SelectedIndexChanged).
string choice;
public Form1()
{
InitializeComponent();
}
// Intentionally empty handler.
public void comboBox1_SelectedIndexChanged(object sender, System.EventArgs e)
{
}
/// <summary>
/// Issues a GET request for the closings page, parses the response with
/// HtmlAgilityPack, and appends text to <c>textBox1</c> for each node matched
/// by the XPath query.
/// </summary>
public void button1_Click(object sender, System.EventArgs e)
{
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
string urlToLoad = "http://www.nbcwashington.com/weather/school-closings/";
HttpWebRequest request = HttpWebRequest.Create(urlToLoad) as HttpWebRequest;
request.Method = "GET";
Console.WriteLine(request.RequestUri.AbsoluteUri);
WebResponse response = request.GetResponse();
htmlDoc.Load(response.GetResponseStream(), true);
if (htmlDoc.DocumentNode != null)
{
var articleNodes = htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p");
if (articleNodes != null && articleNodes.Any())
{
foreach (var articleNode in articleNodes)
{
// BUG (the subject of the question): the loop variable articleNode is never
// used. Instead the query is run again and ToString() is called on the whole
// collection, so each iteration appends the type name
// "HtmlAgilityPack.HtmlNodeCollection" rather than the node's text.
textBox1.AppendText(htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p").ToString());
}
}
}
// NOTE(review): reads a line from standard input inside a UI event handler;
// presumably left over from a console prototype — confirm it is needed.
Console.ReadLine();
}
// Stores the selected list item as a string in the choice field.
private void listBox1_SelectedIndexChanged(object sender, System.EventArgs e)
{
choice = listBox1.SelectedItem.ToString();
}
}
}
那么我在这里遗漏了什么,或者做错了什么?返回的数据应该类似于下面的内容:
Warren County Public Schools Closed
Washington Adventist University Closing at Noon
感谢您查看此内容。
答案 0 :(得分:0)
没关系,我找到问题了。我之前输出的是节点集合对象本身,而不是每个节点的内部文本(InnerText)……下面是修正后的代码,以备有人需要。
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using System;
using HtmlAgilityPack;
namespace WindowsFormsApplication1
{
    /// <summary>
    /// Form that downloads the school-closings page on a button click and lists
    /// the text of every matched &lt;p&gt; node in <c>textBox1</c>, one per line.
    /// </summary>
    public partial class Form1 : Form
    {
        // String form of the item currently selected in listBox1;
        // null when nothing is selected.
        string choice;

        public Form1()
        {
            InitializeComponent();
        }

        // Intentionally empty handler.
        public void comboBox1_SelectedIndexChanged(object sender, System.EventArgs e)
        {
        }

        /// <summary>
        /// Fetches the closings page with a GET request, parses it with
        /// HtmlAgilityPack and appends the inner text of each node matched by the
        /// XPath query to <c>textBox1</c>.
        /// </summary>
        /// <remarks>
        /// Network failures (e.g. <see cref="WebException"/>) are not caught here
        /// and propagate to the caller, as in the original implementation.
        /// </remarks>
        public void button1_Click(object sender, System.EventArgs e)
        {
            string urlToLoad = "http://www.nbcwashington.com/weather/school-closings/";

            // Direct cast instead of `as`: a non-HTTP request type would otherwise
            // surface later as an unexplained NullReferenceException.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlToLoad);
            request.Method = "GET";
            Console.WriteLine(request.RequestUri.AbsoluteUri);

            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.OptionFixNestedTags = true;

            // Dispose the response and its stream so the underlying connection is
            // released even when parsing throws.
            using (WebResponse response = request.GetResponse())
            using (var responseStream = response.GetResponseStream())
            {
                htmlDoc.Load(responseStream, true);
            }

            if (htmlDoc.DocumentNode == null)
            {
                return;
            }

            // SelectNodes returns null (not an empty collection) when nothing matches
            // under HtmlAgilityPack's default settings, so the null check is required.
            var articleNodes = htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p");
            if (articleNodes == null)
            {
                return;
            }

            foreach (var articleNode in articleNodes)
            {
                // InnerText leaves HTML entities (e.g. &amp;) encoded; decode them for
                // display. A WinForms TextBox needs Environment.NewLine ("\r\n"), not a
                // bare "\n", to start a new line.
                textBox1.AppendText(HtmlEntity.DeEntitize(articleNode.InnerText) + Environment.NewLine);
            }
        }

        /// <summary>
        /// Remembers the selected list item as a string; stores null when the
        /// selection has been cleared.
        /// </summary>
        private void listBox1_SelectedIndexChanged(object sender, System.EventArgs e)
        {
            object selected = listBox1.SelectedItem;
            choice = selected == null ? null : selected.ToString();
        }
    }
}
答案 1 :(得分:0)
由于 articleNodes 已经包含您感兴趣的节点,因此无需在循环内再次调用 SelectNodes()。
此外,您不需要检查 null,因为 articleNodes 是一个集合。它可能为空,但不应该是 null。(注意:在 HtmlAgilityPack 的默认设置下,SelectNodes() 在没有匹配节点时实际上会返回 null,因此保留 null 检查更为稳妥。)
请尝试下面的写法,改为访问 InnerHtml(或 InnerText)属性:
var articleNodes = htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p");
// SelectNodes() returns null rather than an empty collection when nothing
// matches (HtmlAgilityPack's default behavior), so guard before enumerating.
var result = articleNodes == null
    ? new List<string>()
    : articleNodes.Select(x => x.InnerHtml.Replace("<br><span>", " ")
                                          .Replace(" </span>", "")).ToList();