我想从一个HTML表格中导入一些数据(链接:http://road2paris.com/wp-content/themes/roadtoparis/api/generated_table_august.html),并在我的Windows窗体应用程序的DataGridView中显示前16个人。据我了解,最好的方法是使用HTML Agility Pack,所以我下载了它并将其引入了我的项目。我知道首先要做的是加载HTML文件的内容。这是我用来加载的代码:
// Address of the generated HTML table that is downloaded below.
const string tableUrl = "http://road2paris.com/wp-content/themes/roadtoparis/api/generated_table_august.html";

// Download the page markup as a single string.
string htmlCode = string.Empty;
using (var client = new WebClient())
{
    // Send an explicit User-Agent header with the request.
    client.Headers.Add(HttpRequestHeader.UserAgent, "AvoidError");
    htmlCode = client.DownloadString(tableUrl);
}

// Hand the downloaded markup to HTML Agility Pack for parsing.
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(htmlCode);
然后我就卡住了。我不知道如何用HTML表格中的数据填充我的DataTable。我尝试了很多不同的解决方案,但似乎都无法正常工作。如果有人能帮助我,我将不胜感激。
答案 0 :(得分:12)
// Parse the previously downloaded markup (htmlCode) with HTML Agility Pack.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(htmlCode);

DataTable table = new DataTable();

// SelectNodes returns null (not an empty collection) when nothing matches,
// so both node sets are checked before being iterated.
var headers = doc.DocumentNode.SelectNodes("//tr/th");
if (headers != null)
{
    foreach (HtmlNode header in headers)
    {
        table.Columns.Add(header.InnerText); // create columns from th
    }
}

// select rows with td elements
var rows = doc.DocumentNode.SelectNodes("//tr[td]");
if (rows != null)
{
    foreach (var row in rows)
    {
        // "//tr[td]" only matches rows that contain at least one td, so this is never null.
        var cells = row.SelectNodes("td").Select(td => td.InnerText).ToArray();

        // Rows.Add rejects an array longer than the column count; add
        // auto-named columns so wide rows (or header-less tables) still load.
        while (table.Columns.Count < cells.Length)
        {
            table.Columns.Add();
        }

        table.Rows.Add(cells);
    }
}
您需要HTML Agility Pack库才能使用此代码。
答案 1 :(得分:0)
下面是我编写的代码,用于防止出现重复的表头。创建DataTable时,每一列都必须具有唯一的名称。此外,有时HTML行中的单元格数量会超过已有的列数,这时必须向DataTable中添加额外的列,否则多出的数据会丢失。以下就是我的解决方案。
'''
/// <summary>
/// Selects how a duplicated table header is rewritten so that it no longer
/// clashes with another header.
/// </summary>
public enum DuplicateHeaderReplacementStrategy
{
    /// <summary>Append a letter to the duplicated header.</summary>
    AppendAlpha = 0,

    /// <summary>Append a running number to the duplicated header.</summary>
    AppendNumeric = 1,

    /// <summary>Replace the duplicated header's content with an empty string.</summary>
    Delete = 2
}
/// <summary>
/// Helpers for pre-processing a parsed HTML document before its tables are
/// converted into data tables.
/// </summary>
public class HtmlServices
{
    private static readonly string[] Alpha = new[] { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" };

    /// <summary>
    /// Rewrites repeated header cells so that, within each table, header texts
    /// no longer collide. The first occurrence of a header text is kept; every
    /// later cell with the same text is rewritten according to
    /// <paramref name="strategy"/>.
    /// </summary>
    /// <param name="doc">The parsed document; it is modified in place. May be null.</param>
    /// <param name="strategy">How each repeated header cell is rewritten.</param>
    /// <returns>
    /// The same <paramref name="doc"/> instance (null when null was passed in).
    /// </returns>
    public static HtmlDocument RenameDuplicateHeaders(HtmlDocument doc, DuplicateHeaderReplacementStrategy strategy)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches.
        var tables = doc?.DocumentNode?.SelectNodes("//table");
        if (tables == null)
        {
            return doc;
        }

        // One counter for the whole document, so every appended suffix is distinct.
        var index = 0;

        foreach (HtmlNode table in tables)
        {
            // ".//th" reaches header cells nested under tr/thead; a bare "th"
            // would only match direct children of the table element.
            var headerCells = table.SelectNodes(".//th");
            if (headerCells == null)
            {
                continue;
            }

            // Group by the visible text (grouping by node reference can never
            // find a duplicate) and keep every occurrence after the first.
            // Materialized up front because the loop below mutates the cells.
            var repeatedCells = headerCells
                .GroupBy(cell => cell.InnerText)
                .Where(group => group.Count() > 1)
                .SelectMany(group => group.Skip(1))
                .ToList();

            foreach (HtmlNode cell in repeatedCells)
            {
                switch (strategy)
                {
                    case DuplicateHeaderReplacementStrategy.AppendNumeric:
                        cell.InnerHtml += index++;
                        break;
                    case DuplicateHeaderReplacementStrategy.AppendAlpha:
                        cell.InnerHtml += ToAlphaSuffix(index++);
                        break;
                    case DuplicateHeaderReplacementStrategy.Delete:
                        cell.InnerHtml = string.Empty;
                        break;
                }
            }
        }

        return doc;
    }

    /// <summary>
    /// Converts a zero-based index to a spreadsheet-style letter suffix
    /// (0 = "A", 25 = "Z", 26 = "AA", ...) so more than 26 duplicates are supported.
    /// </summary>
    private static string ToAlphaSuffix(int index)
    {
        var suffix = string.Empty;
        for (var n = index; n >= 0; n = n / Alpha.Length - 1)
        {
            suffix = Alpha[n % Alpha.Length] + suffix;
        }

        return suffix;
    }
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and converts its table markup
/// into a <see cref="DataTable"/>: every <c>th</c> under a <c>tr</c> becomes a
/// column and every <c>tr</c> that contains <c>td</c> cells becomes a row.
/// </summary>
/// <param name="url">Address of the HTML page to load.</param>
/// <param name="htmlIds">Not used by the current implementation; kept for caller compatibility.</param>
/// <returns>
/// The populated table. Rows that cannot be added are logged and skipped.
/// </returns>
public static DataTable GetDataTableFromHtmlTable(string url, string[] htmlIds)
{
    // Process-wide setting: outgoing connections negotiate TLS 1.1 or 1.2.
    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls11 | SecurityProtocolType.Tls12;

    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc = web.Load(url);
    doc = HtmlServices.RenameDuplicateHeaders(doc, DuplicateHeaderReplacementStrategy.AppendNumeric);

    DataTable table = new DataTable();

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var headers = doc.DocumentNode.SelectNodes("//tr/th");
    if (headers != null)
    {
        foreach (HtmlNode header in headers)
        {
            string columnName = header.InnerText;

            // ColumnExists is an extension helper defined outside this block.
            if (table.ColumnExists(columnName))
            {
                // Name already taken: append the first numeric suffix that is free.
                int columnIteration = 0;
                while (table.ColumnExists(columnName + columnIteration.ToString()))
                {
                    columnIteration++;
                }

                columnName += columnIteration.ToString();
            }

            table.Columns.Add(columnName); // create columns from th
        }
    }

    // select rows with td elements
    var rows = doc.DocumentNode.SelectNodes("//tr[td]");
    if (rows == null)
    {
        return table;
    }

    // Numbering continues across rows so a later, wider row never reuses an
    // "ExtraColumn N" name that an earlier row already added.
    int extraColumnNumber = 0;

    foreach (var row in rows)
    {
        // StripHtmlTables is an extension helper defined outside this block.
        var addRow = row.SelectNodes("td").Select(td => td.InnerHtml.StripHtmlTables()).ToArray();

        // Widen the table so no cell of this row is dropped.
        while (table.Columns.Count < addRow.Length)
        {
            string extraColumnName;
            do
            {
                extraColumnName = $"ExtraColumn {++extraColumnNumber}";
            }
            while (table.Columns.Contains(extraColumnName));

            table.Columns.Add(extraColumnName);
        }

        try
        {
            table.Rows.Add(addRow);
        }
        catch (Exception e)
        {
            // Best effort: report the bad row and keep loading the rest.
            // NOTE(review): the original called `debug.Print`; assumed to be
            // System.Diagnostics.Debug — confirm no project-local `debug` exists.
            System.Diagnostics.Debug.Print(e.Message);
        }
    }

    return table;
}