将数据从HTML表导入到C#中的DataTable

时间:2013-08-06 21:14:32

标签: c# html datatable html-agility-pack

我想从HTML表格中导入一些数据(这里是一个链接http://road2paris.com/wp-content/themes/roadtoparis/api/generated_table_august.html),并在我的窗体应用程序的DataGridView中显示前16个人。据我了解,最好的方法是使用HTML Agility Pack,所以我下载了它并将其包含在我的项目中。我知道首先要做的是加载html文件的内容。这是我使用的代码:

        // Address of the generated HTML table that should be imported.
        const string tableUrl = "http://road2paris.com/wp-content/themes/roadtoparis/api/generated_table_august.html";

        // Download the page as a string; a User-Agent header is sent with the request.
        string htmlCode = "";
        using (var webClient = new WebClient())
        {
            webClient.Headers.Add(HttpRequestHeader.UserAgent, "AvoidError");
            htmlCode = webClient.DownloadString(tableUrl);
        }

        // Hand the downloaded markup to HTML Agility Pack for parsing.
        var doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(htmlCode);

然后我卡住了。我不知道如何使用html表中的数据填充我的DataTable。我尝试了很多不同的解决方案,但似乎都无法正常工作。如果有人能帮助我,我会很高兴。

2 个答案:

答案 0 :(得分:12)

// Parse the previously downloaded markup (htmlCode) with HTML Agility Pack.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(htmlCode);

DataTable table = new DataTable();

// SelectNodes yields null rather than an empty collection when the XPath
// matches nothing, so fall back to an empty sequence instead of letting the
// foreach throw a NullReferenceException on pages without <th> cells.
var headers = doc.DocumentNode.SelectNodes("//tr/th") ?? Enumerable.Empty<HtmlNode>();
foreach (HtmlNode header in headers)
{
    table.Columns.Add(header.InnerText); // create columns from th
}

// Select rows with td elements; the same null fallback covers pages that
// contain no data rows at all.
var rows = doc.DocumentNode.SelectNodes("//tr[td]") ?? Enumerable.Empty<HtmlNode>();
foreach (HtmlNode row in rows)
{
    // One DataRow per <tr>, one value per <td>, in document order.
    table.Rows.Add(row.SelectNodes("td").Select(td => td.InnerText).ToArray());
}

您需要HTML Agility Pack库才能使用此代码。

答案 1 :(得分:0)

下面是我编写的代码,它可以防止出现重复的列标题。创建DataTable时,每个“列”必须具有唯一的名称。此外,有时某个HTML行的单元格数量可能超过现有的列数,这时您必须向DataTable中添加额外的列,否则数据将会丢失。这就是我的解决方案。

'''
/// <summary>
/// Selects how a repeated table-header caption is altered when duplicate
/// headers are renamed.
/// </summary>
public enum DuplicateHeaderReplacementStrategy
{
    /// <summary>Append a letter suffix to the header's markup.</summary>
    AppendAlpha = 0,

    /// <summary>Append a running number to the header's markup.</summary>
    AppendNumeric = 1,

    /// <summary>Replace the header's markup with an empty string.</summary>
    Delete = 2
}

/// <summary>
/// Helpers that adjust a parsed HTML document before it is turned into tabular data.
/// </summary>
public class HtmlServices
{
    // Letters used to build AppendAlpha suffixes (A..Z, then AA, AB, ...).
    private static readonly string[] Alpha = new[] { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" };

    /// <summary>
    /// Makes repeated <c>th</c> captions inside each <c>table</c> of the document distinct.
    /// The first occurrence of a caption is left untouched; every later occurrence is
    /// changed according to <paramref name="strategy"/>.
    /// </summary>
    /// <param name="doc">Document to modify in place. May be null.</param>
    /// <param name="strategy">
    /// How a repeated caption is changed: a numeric or alphabetic suffix is appended to
    /// the header's inner HTML, or the inner HTML is cleared. Unknown values leave the
    /// header unchanged.
    /// </param>
    /// <returns>The same <paramref name="doc"/> instance (null when null was passed).</returns>
    public static HtmlDocument RenameDuplicateHeaders(HtmlDocument doc, DuplicateHeaderReplacementStrategy strategy)
    {
        // SelectNodes yields null when nothing matches; handle that (and a null
        // document) explicitly instead of relying on a catch-all handler.
        var tables = doc?.DocumentNode?.SelectNodes("//table");
        if (tables == null)
        {
            return doc;
        }

        // The suffix counter runs across all tables of the document.
        var index = 0;
        foreach (HtmlNode table in tables)
        {
            // ".//th" finds header cells anywhere below the table (normally under
            // tr/thead); a bare "th" would only match direct children of <table>.
            var headers = table.SelectNodes(".//th");
            if (headers == null)
            {
                continue;
            }

            // Duplicates are detected by caption text, not by node identity.
            // Case is ignored because DataColumnCollection documents its duplicate
            // check as case-insensitive (NOTE(review): confirm against the target runtime).
            var seen = new System.Collections.Generic.HashSet<string>(System.StringComparer.OrdinalIgnoreCase);
            foreach (HtmlNode header in headers)
            {
                var caption = header.InnerText;
                if (seen.Add(caption))
                {
                    continue; // first occurrence keeps its caption
                }

                switch (strategy)
                {
                    case DuplicateHeaderReplacementStrategy.AppendNumeric:
                    case DuplicateHeaderReplacementStrategy.AppendAlpha:
                        string suffix;
                        do
                        {
                            suffix = BuildSuffix(strategy, index);
                            index++;
                        }
                        while (!seen.Add(caption + suffix)); // skip suffixes that collide with an existing caption
                        header.InnerHtml += suffix;
                        break;

                    case DuplicateHeaderReplacementStrategy.Delete:
                        header.InnerHtml = string.Empty;
                        break;
                }
            }
        }

        return doc;
    }

    // Returns the suffix for the given counter value: the number itself for
    // AppendNumeric, otherwise a letter sequence A..Z, AA, AB, ... so the counter
    // can exceed 26 without running off the end of the Alpha array.
    private static string BuildSuffix(DuplicateHeaderReplacementStrategy strategy, int index)
    {
        if (strategy == DuplicateHeaderReplacementStrategy.AppendNumeric)
        {
            return index.ToString();
        }

        var letters = string.Empty;
        for (var n = index; n >= 0; n = n / Alpha.Length - 1)
        {
            letters = Alpha[n % Alpha.Length] + letters;
        }
        return letters;
    }
}


/// <summary>
/// Loads the page at <paramref name="url"/> and copies its table cells into a
/// <see cref="DataTable"/>: one column per <c>th</c> found under a <c>tr</c>, one row per
/// <c>tr</c> that contains <c>td</c> cells.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <param name="htmlIds">Not read by this method; kept so existing callers keep compiling.</param>
/// <returns>
/// The populated table. Rows with more cells than there are columns cause additional
/// "ExtraColumn N" columns to be appended; rows that still cannot be added are skipped
/// after their error message is printed.
/// </returns>
public static DataTable GetDataTableFromHtmlTable(string url, string[] htmlIds)
{
    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls11 | SecurityProtocolType.Tls12;
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc = web.Load(url);

    doc = HtmlServices.RenameDuplicateHeaders(doc, DuplicateHeaderReplacementStrategy.AppendNumeric);

    DataTable table = new DataTable();

    // SelectNodes yields null when nothing matches; a page without <th> cells
    // simply produces no named columns instead of a NullReferenceException.
    var headers = doc.DocumentNode.SelectNodes("//tr/th");
    if (headers != null)
    {
        foreach (HtmlNode header in headers)
        {
            string columnName = header.InnerText;
            if (table.ColumnExists(columnName))
            {
                // Caption already used: append the first free numeric suffix.
                int columnIteration = 0;
                while (table.ColumnExists(header.InnerText + columnIteration.ToString()))
                {
                    columnIteration++;
                }
                columnName = header.InnerText + columnIteration.ToString();
            }
            table.Columns.Add(columnName); // create columns from th
        }
    }

    // select rows with td elements
    var rows = doc.DocumentNode.SelectNodes("//tr[td]");
    if (rows == null)
    {
        return table;
    }

    // Numbering for extra columns continues across rows; restarting at 1 for
    // every over-long row would try to add "ExtraColumn 1" twice and throw.
    int extraColumnCount = 0;
    foreach (HtmlNode row in rows)
    {
        var addRow = row.SelectNodes("td").Select(td => td.InnerHtml.StripHtmlTables()).ToArray();

        // Grow the table until the row fits so no cell value is dropped.
        while (table.Columns.Count < addRow.Length)
        {
            string extraName;
            do
            {
                extraColumnCount++;
                extraName = $"ExtraColumn {extraColumnCount}";
            }
            while (table.Columns.Contains(extraName)); // a header may already carry this name
            table.Columns.Add(extraName);
        }

        try
        {
            table.Rows.Add(addRow);
        }
        catch (Exception e)
        {
            // Best effort: report the row that could not be added and carry on.
            debug.Print(e.Message);
        }
    }
    return table;
}