网页抓取列表网站

时间:2016-08-27 00:41:56

标签: c# .net web-scraping

我正在尝试抓取一个网站——我在其他项目里做过同样的事情,但这次似乎怎么都不行。可能是因为我已经连续工作了两天多,遗漏了什么东西。能请人帮我看看代码吗?代码如下:

using System;
using System.Collections.Generic;
using HtmlAgilityPack;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;
using System.Xml.Linq;
using System.IO;

public partial class _Default : System.Web.UI.Page
{
    // Parallel lists: index i across all three refers to the same business listing.
    List<string> names = new List<string>();
    List<string> address = new List<string>();
    List<string> number = new List<string>();

    /// <summary>
    /// Scrapes one results page of scoot.co.uk and writes the listings
    /// (name / address / phone number) to the response as XML.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        // NOTE(review): the space before "-in-uk" mirrors the original URL
        // scheme used by the question — confirm it matches the live site.
        string url = "http://www.scoot.co.uk/find/" + "cafe" + " " + "-in-uk?page=" + "4";
        var webGet = new HtmlWeb();
        var doc = webGet.Load(url);

        // SelectNodes returns null (not an empty collection) when nothing
        // matches, so guard each query before iterating.
        var nameNodes = doc.DocumentNode.SelectNodes("//h2//a");
        if (nameNodes != null)
        {
            foreach (HtmlNode node in nameNodes)
            {
                names.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
            }
        }
        var addressNodes = doc.DocumentNode.SelectNodes("//p[@class='result-address']");
        if (addressNodes != null)
        {
            foreach (HtmlNode node in addressNodes)
            {
                address.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
            }
        }
        var numberNodes = doc.DocumentNode.SelectNodes("//p[@class='result-number']");
        if (numberNodes != null)
        {
            foreach (HtmlNode node in numberNodes)
            {
                number.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
            }
        }

        // Bug fix 1: the original projected from an always-empty mainList, so the
        // query yielded nothing, and it indexed only element [0] of each list.
        // Iterate the indices of the scraped lists instead (clamped to the
        // shortest, so mismatched scrape counts cannot go out of range).
        // Bug fix 2: "Name : " is not a legal XML element name (spaces and the
        // colon are forbidden), which throws at document construction; use
        // plain element names.
        int count = Math.Min(names.Count, Math.Min(address.Count, number.Count));
        XDocument doccy = new XDocument(
            new XDeclaration("1.0", "utf-8", "yes"),
            new XComment("Business For Sale"),
            new XElement("Data",
                from i in Enumerable.Range(0, count)
                select new XElement("Listing",
                    new XElement("Name", names[i]),
                    new XElement("Address", address[i]),
                    new XElement("Number", number[i]))));

        Response.ContentType = "text/xml"; // Must be 'text/xml'
        Response.ContentEncoding = System.Text.Encoding.UTF8; // We'd like UTF-8
        doccy.Save(Response.Output); // Save to the text-writer
    }
}

网站列出了商家名称、电话号码和地址,它们都由 CSS 类名(result-address、result-number 等)标识。我正在尝试生成 XML 输出,以便从第 4 页的每条列表中取出公司名称、地址和电话号码,明天要用它做演示,但我无法让它完全正常工作!

三个 foreach 循环抓取到的结果都是正确的,但它们无法输出到 XML 中——我得到了索引超出范围(out of range)的错误。

1 个答案:

答案 0 :(得分:1)

我的第一条建议是让CodeBehind尽可能轻松。如果你用业务逻辑膨胀,那么解决方案将变得难以维护。这不是主题,但我建议查阅SOLID原则。

首先,我创建了一个自定义对象,而不是使用多个并列的字符串列表——用并列的字符串列表无法知道哪个地址项与哪个名称相关联:

/// <summary>
/// One scraped business listing (a single search result).
/// Mutable DTO with a public parameterless constructor so it can be
/// serialized by <c>XmlSerializer</c>.
/// </summary>
public class Listing
{
    /// <summary>Business name; null when the result had no name link.</summary>
    public string Name { get; set; }
    /// <summary>Postal address; null when the result had no address element.</summary>
    public string Address { get; set; }
    /// <summary>Phone number; null when the result had no number element.</summary>
    public string Number { get; set; }
}

这是核心部分:一个完成所有抓取和序列化工作的类(我在这里违反了 SOLID 原则,但有时你只是想先让它跑起来。)

using System.Collections.Generic;
using HtmlAgilityPack;
using System.IO;
using System.Xml;
using System.Xml.Serialization;
using System.Linq;
public class TheScraper
{
    /// <summary>
    /// Scrapes one results page of scoot.co.uk and returns only the listings
    /// for which all three fields (name, address, number) were found.
    /// </summary>
    /// <param name="pageNumber">1-based page number appended to the search URL.</param>
    public List<Listing> DoTheScrape(int pageNumber)
    {
        List<Listing> result = new List<Listing>();

        string url = "http://www.scoot.co.uk/find/" + "cafe" + " " + "-in-uk?page=" + pageNumber;

        var webGet = new HtmlWeb();
        var doc = webGet.Load(url);

        // Select the top level node: the closest we can get to the element in
        // which all the listings are children. SelectNodes returns null (not
        // empty) when nothing matches.
        var nodes = doc.DocumentNode.SelectNodes("//*[@id='list']/div/div/div/div");

        if (nodes != null)
        {
            foreach (var node in nodes)
            {
                Listing listing = new Listing();

                // Each sub-query is null-checked manually; with C# 6+ the
                // null-propagating operator (?.) would shorten these.
                var nameNode = node.SelectSingleNode("./div/div/div/div/h2/a");
                if (nameNode != null) listing.Name = nameNode.InnerText;

                var addressNode = node.SelectSingleNode("./div/div/div/div/p[@class='result-address']");
                if (addressNode != null) listing.Address = addressNode.InnerText.Trim();

                // Bug fix: Attributes["..."] returns null when the attribute is
                // missing, so dereferencing .Value directly could throw a
                // NullReferenceException; check both the node and the attribute.
                var numberNode = node.SelectSingleNode("./div/div/div/div/p[@class='result-number']/a");
                var numberAttr = numberNode != null ? numberNode.Attributes["data-visible-number"] : null;
                if (numberAttr != null) listing.Number = numberAttr.Value;

                result.Add(listing);
            }
        }

        // Drop partial rows so callers only ever see complete listings.
        result = result.Where(x => x.Name != null && x.Address != null && x.Number != null).ToList();

        return result;
    }

    /// <summary>
    /// Serializes the listings to an indented XML string via XmlSerializer.
    /// </summary>
    public string SerializeTheListings(List<Listing> listings)
    {
        var xmlSerializer = new XmlSerializer(typeof(List<Listing>));

        using (var stringWriter = new StringWriter())
        using (var xmlWriter = XmlWriter.Create(stringWriter, new XmlWriterSettings { Indent = true }))
        {
            xmlSerializer.Serialize(xmlWriter, listings);
            return stringWriter.ToString();
        }
    }
}

然后你的代码看起来像这样,加上对scraper类和模型类的引用:

public partial class _Default : System.Web.UI.Page
{
    /// <summary>
    /// Scrapes the first five result pages and serializes all listings to XML.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        TheScraper scraper = new TheScraper();
        List<Listing> listings = new List<Listing>();

        // Bug fix: the site's paging is 1-based (the question requested
        // page=4), so iterate pages 1..5 — the original 0..4 requested a bogus
        // page 0 and skipped page 5. AddRange also replaces the per-iteration
        // Union(...).ToList(), which copied the whole list every pass and never
        // deduplicated anyway (Listing uses reference equality and every scrape
        // creates fresh objects).
        // Quick hack to fetch a fixed number of pages; a robust version would
        // discover the page count or follow the "next page" link.
        for (int page = 1; page <= 5; page++)
        {
            listings.AddRange(scraper.DoTheScrape(page));
        }

        string xmlListings = scraper.SerializeTheListings(listings);

        // NOTE(review): xmlListings is not written to the response here —
        // presumably the surrounding page consumes it; confirm with the caller.
    }
}
相关问题