I'm trying to scrape a website - I've done this on other projects, but I can't seem to get it working here. It may just be that I've been at it for 2+ days and I'm missing something. Could someone please look over my code? Here it is:
using System;
using System.Collections.Generic;
using HtmlAgilityPack;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;
using System.Xml.Linq;
using System.IO;
public partial class _Default : System.Web.UI.Page
{
    List<string> names = new List<string>();
    List<string> address = new List<string>();
    List<string> number = new List<string>();

    protected void Page_Load(object sender, EventArgs e)
    {
        string url = "http://www.scoot.co.uk/find/" + "cafe" + " " + "-in-uk?page=" + "4";

        var Webget = new HtmlWeb();
        var doc = Webget.Load(url);
        List<List<string>> mainList = new List<List<string>>();

        foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//h2//a"))
        {
            names.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
        }
        foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//p[@class='result-address']"))
        {
            address.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
        }
        foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//p[@class='result-number']"))
        {
            number.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
        }

        XDocument doccy = new XDocument(
            new XDeclaration("1.0", "utf-8", "yes"),
            new XComment("Business For Sale"),
            new XElement("Data",
                from data in mainList
                select new XElement("data", new XAttribute("data", "data"),
                    new XElement("Name : ", names[0]),
                    new XElement("Add : ", address[0]),
                    new XElement("Number : ", number[0])
                )
            )
        );

        var xml = doccy.ToString();

        Response.ContentType = "text/xml"; //Must be 'text/xml'
        Response.ContentEncoding = System.Text.Encoding.UTF8; //We'd like UTF-8
        doccy.Save(Response.Output); //Save to the text-writer
    }
}
The site lists business names, phone numbers and addresses, and they are all identified by class names (result-address, result-number, etc.). I'm trying to produce XML output so that I can get the company name, address and phone number of every listing on page 4 for a demo tomorrow, but I can't get it working properly!
The results are correct in all three foreach loops, but they won't come out in the XML; I get an out-of-range error.
Answer 0 (Score: 1)
My first piece of advice would be to keep the code-behind as light as possible. If you bloat it up with business logic, the solution becomes hard to maintain. That's off-topic here, but I'd recommend reading up on the SOLID principles.
First, instead of using lists of strings, which give you no way of knowing which address item belongs to which name, I created a custom object:
public class Listing
{
    public string Name { get; set; }
    public string Address { get; set; }
    public string Number { get; set; }
}
Then the core of it: a class that does all the scraping and serializing (I've broken the SOLID principles here, but sometimes you just want it to work).
using System.Collections.Generic;
using HtmlAgilityPack;
using System.IO;
using System.Xml;
using System.Xml.Serialization;
using System.Linq;

public class TheScraper
{
    public List<Listing> DoTheScrape(int pageNumber)
    {
        List<Listing> result = new List<Listing>();

        string url = "http://www.scoot.co.uk/find/" + "cafe" + " " + "-in-uk?page=" + pageNumber;
        var Webget = new HtmlWeb();
        var doc = Webget.Load(url);

        // Select the top-level nodes; this is the closest we can get to the elements
        // that all the listings are children of.
        var nodes = doc.DocumentNode.SelectNodes("//*[@id='list']/div/div/div/div");

        // Loop through each child.
        if (nodes != null)
        {
            foreach (var node in nodes)
            {
                Listing listing = new Listing();

                // Get each individual field and manually check for nulls.
                // listing.Name = node.SelectSingleNode("./div/div/div/div/h2/a")?.InnerText; // easier null check if you can use the null-propagating operator
                var nameNode = node.SelectSingleNode("./div/div/div/div/h2/a");
                if (nameNode != null) listing.Name = nameNode.InnerText;

                var addressNode = node.SelectSingleNode("./div/div/div/div/p[@class='result-address']");
                if (addressNode != null) listing.Address = addressNode.InnerText.Trim();

                var numberNode = node.SelectSingleNode("./div/div/div/div/p[@class='result-number']/a");
                if (numberNode != null) listing.Number = numberNode.Attributes["data-visible-number"].Value;

                result.Add(listing);
            }
        }

        // Filter out entries with missing fields.
        result = result.Where(x => x.Name != null && x.Address != null && x.Number != null).ToList();

        return result;
    }

    public string SerializeTheListings(List<Listing> listings)
    {
        var xmlSerializer = new XmlSerializer(typeof(List<Listing>));

        using (var stringWriter = new StringWriter())
        using (var xmlWriter = XmlWriter.Create(stringWriter, new XmlWriterSettings { Indent = true }))
        {
            xmlSerializer.Serialize(xmlWriter, listings);
            return stringWriter.ToString();
        }
    }
}
Then your page would look something like this, plus references to the scraper class and the model class:
public partial class _Default : System.Web.UI.Page
{
    protected void Page_Load(object sender, EventArgs e)
    {
        TheScraper scraper = new TheScraper();
        List<Listing> listings = new List<Listing>();

        // Quick hack to loop 5 times to get all 5 pages. If this is run frequently
        // you'd want to identify the page count automatically, or start at page one
        // and follow the link to the next page (see the paging sketch below).
        for (int i = 0; i < 5; i++)
        {
            listings = listings.Union(scraper.DoTheScrape(i)).ToList();
        }

        string xmlListings = scraper.SerializeTheListings(listings);
    }
}
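As hinted in the comment above, one way to avoid hard-coding the page count is to keep requesting pages until one comes back empty. This is only a sketch, under the assumption that DoTheScrape returns an empty list once the page number is past the last page of results; ScrapeAllPages and maxPages are hypothetical names, not part of the answer above.

// Hypothetical helper: scrape pages until one returns no listings.
// Assumes DoTheScrape(page) yields an empty list for a page past the end;
// maxPages is a safety cap in case the site never returns an empty page.
public List<Listing> ScrapeAllPages(TheScraper scraper, int firstPage = 1, int maxPages = 50)
{
    var all = new List<Listing>();
    for (int page = firstPage; page < firstPage + maxPages; page++)
    {
        var pageListings = scraper.DoTheScrape(page);
        if (pageListings.Count == 0)
            break; // no more results, stop paging
        all.AddRange(pageListings);
    }
    return all;
}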
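Finally, if you still want the page to return the XML directly, the way the original Page_Load did with Response.Output, a minimal sketch that could go at the end of Page_Load, reusing the Response settings from the question's own code, might be:

// Write the serialized listings back as the page response,
// mirroring what the original code did with the XDocument.
Response.ContentType = "text/xml";
Response.ContentEncoding = System.Text.Encoding.UTF8;
Response.Write(xmlListings);
Response.End(); // stop further page rendering so only the XML is sent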