我正在尝试使用 Abot 制作一个 C# 网络爬虫。我按照 QuickStart Tutorial 一步步操作,但似乎无法让它正常运行。
方法 crawler_ProcessPageCrawlCompleted 中出现了一个未处理的异常,正好发生在这一行:
if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
{
Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
}
原因是 crawledPage.HttpWebResponse 为 null。
我可能遗漏了什么,但究竟遗漏了什么呢?
我按照教程的建议编辑了我的app.config文件,这是我的类(引用Abot.dll):
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Abot.Crawler;
using Abot.Poco;
using System.Net;
using System.Windows.Forms; // for HttpStatusCode
namespace WebCrawler
{
/// <summary>
/// Small wrapper around an Abot <see cref="PoliteWebCrawler"/> that wires up
/// console-logging event handlers and runs a single crawl.
/// Call <see cref="initialize"/> once before calling <see cref="doCrawl"/>.
/// </summary>
public class MyCrawler
{
    public MyCrawler()
    {
    }

    /// <summary>
    /// The underlying crawler. It is null until <see cref="initialize"/> has run.
    /// </summary>
    public PoliteWebCrawler crawler;

    /// <summary>
    /// Creates the crawler and subscribes the logging handlers to its
    /// asynchronous page events.
    /// </summary>
    public void initialize()
    {
        // 3. Create an instance of Abot.Crawler.PoliteWebCrawler.
        // 3.2 The parameterless constructor is used because the settings were
        //     placed in app.config (tutorial option 2.1).
        crawler = new PoliteWebCrawler();

        // 4. Register for events (the asynchronous variants are used here).
        crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

        // 5. (Optional, not used here) Custom objects can be added to the dynamic
        //    crawl bag via crawler.CrawlBag and read back in the event handlers
        //    through e.CrawlContext.CrawlBag.
    }// initialize()

    /// <summary>
    /// Crawls http://yahoo.com and reports on the console whether the crawl
    /// finished with or without an error.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Thrown when <see cref="initialize"/> has not been called yet.
    /// </exception>
    public void doCrawl()
    {
        // Fail with a clear message instead of a NullReferenceException.
        if (crawler == null)
        {
            throw new InvalidOperationException("initialize() must be called before doCrawl().");
        }

        CrawlResult result = crawler.Crawl(new Uri("http://yahoo.com"));
        if (result.ErrorOccurred)
        {
            /* line 60 : */
            // The referenced Abot.dll has no CrawlResult.ErrorMessage member, so only
            // the root URI is reported here.
            // NOTE(review): this Abot version presumably exposes the failure detail
            // through a different member (e.g. ErrorException) - confirm and log it.
            Console.WriteLine("Crawl of {0} completed with error.", result.RootUri.AbsoluteUri);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }
    }

    /// <summary>Logs the page that is about to be crawled and its parent page.</summary>
    private void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
    {
        PageToCrawl pageToCrawl = e.PageToCrawl;
        Console.WriteLine("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
    }

    /// <summary>
    /// Logs whether a page crawl succeeded and whether the page had any content.
    /// </summary>
    private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
    {
        CrawledPage crawledPage = e.CrawledPage;

        /* line 84 : */
        // HttpWebResponse can be null (no response was received at all), so it must
        // be checked before StatusCode is read; a missing response counts as a failure.
        bool crawlFailed = crawledPage.WebException != null
            || crawledPage.HttpWebResponse == null
            || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK;

        if (crawlFailed)
        {
            Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
        }
        else
        {
            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
        }

        if (string.IsNullOrEmpty(crawledPage.RawContent))
        {
            Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
        }
    }

    /// <summary>Logs why the links found on a crawled page were not followed.</summary>
    private void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
    {
        CrawledPage crawledPage = e.CrawledPage;
        Console.WriteLine("Did not crawl the links on page {0} due to {1}", crawledPage.Uri.AbsoluteUri, e.DisallowedReason);
    }

    /// <summary>Logs why a page was not crawled.</summary>
    private void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
    {
        PageToCrawl pageToCrawl = e.PageToCrawl;
        Console.WriteLine("Did not crawl page {0} due to {1}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
    }
}// end of public class MyCrawler
}
错误在第84行。
此外,第60行还有一个问题(可能说明我遗漏了什么),编译器给出的错误信息如下:
'Abot.Poco.CrawlResult' does not contain a definition for 'ErrorMessage' and no extension method 'ErrorMessage' accepting a first argument of type 'Abot.Poco.CrawlResult' could be found (are you missing a using directive or an assembly reference?)
感谢您的帮助!
答案 0 :(得分:0)
这意味着您抓取的网址没有响应 HTTP 请求(也就是说,该网址并不存在,例如 http://shhdggdhshshhsjsjj.com)。这种情况可能会导致 HttpWebResponse 和 WebException 属性都为 null。