我的程序是一个webcrawler。 我试图从网站下载图像。 在我的webcrawler网站上,我做了:
try
{
HtmlAgilityPack.HtmlDocument doc = TimeOut.getHtmlDocumentWebClient(mainUrl, false, "", 0, "", "");
if (doc == null)
{
if (wccfg.downloadcontent == true)
{
retwebcontent.retrieveImages(mainUrl);
}
failed = true;
wccfg.failedUrls++;
failed = false;
}
例如,当doc为null时,mainUrl包含:
http://members.tripod.com/~VanessaWest/bundybowman2.jpg
现在它跳转到另一个类中的retrieveImages方法:
namespace GatherLinks
{
class RetrieveWebContent
{
HtmlAgilityPack.HtmlDocument doc;
string imgg;
int images;
public RetrieveWebContent()
{
images = 0;
}
public List<string> retrieveImages(string address)
{
try
{
doc = new HtmlAgilityPack.HtmlDocument();
System.Net.WebClient wc = new System.Net.WebClient();
List<string> imgList = new List<string>();
doc.Load(wc.OpenRead(address));
HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img[@src]");
if (imgs == null) return new List<string>();
foreach (HtmlNode img in imgs)
{
if (img.Attributes["src"] == null)
continue;
HtmlAttribute src = img.Attributes["src"];
imgList.Add(src.Value);
if (src.Value.StartsWith("http") || src.Value.StartsWith("https") || src.Value.StartsWith("www"))
{
images++;
string[] arr = src.Value.Split('/');
imgg = arr[arr.Length - 1];
//imgg = Path.GetFileName(new Uri(src.Value).LocalPath);
//wc.DownloadFile(src.Value, @"d:\MyImages\" + imgg);
wc.DownloadFile(src.Value, "d:\\MyImages\\" + Guid.NewGuid() + ".jpg");
}
}
return imgList;
}
catch
{
Logger.Write("There Was Problem Downloading The Image: " + imgg);
return null;
}
}
}
}
现在我正在逐行使用断点和步骤,并在执行此行之后:
HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img[@src]");
变量imgs为null。 然后在下一行检查它的null是否跳到最后并且什么都不做。
我如何解决它,以便能够从http://members.tripod.com/~VanessaWest/bundybowman2.jpg下载图像?
编辑**
public List<string> retrieveImages(string address)
{
try
{
doc = new HtmlAgilityPack.HtmlDocument();
System.Net.WebClient wc = new System.Net.WebClient();
List<string> imgList = new List<string>();
doc.Load(wc.OpenRead(address));
string t = doc.DocumentNode.InnerText;
HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img//[@src]");
if (imgs == null) return new List<string>();
foreach (HtmlNode img in imgs)
{
if (img.Attributes["src"] == null)
continue;
HtmlAttribute src = img.Attributes["src"];
wc.DownloadFile(src.Value, "d:\\MyImages\\" + Guid.NewGuid() + ".jpg");
imgList.Add(src.Value);
if (src.Value.StartsWith("http") || src.Value.StartsWith("https") || src.Value.StartsWith("www"))
{
images++;
string[] arr = src.Value.Split('/');
imgg = arr[arr.Length - 1];
//imgg = Path.GetFileName(new Uri(src.Value).LocalPath);
//wc.DownloadFile(src.Value, @"d:\MyImages\" + imgg);
wc.DownloadFile(src.Value, "d:\\MyImages\\" + Guid.NewGuid() + ".jpg");
}
}
return imgList;
}
catch
{
Logger.Write("There Was Problem Downloading The Image: " + imgg);
return null;
}
}
答案 0 :(得分:1)
如果您查看WebClient返回给您的内部数据,您会看到没有Html页面,但是图像的位数据。
doc.Load(wc.OpenRead(address));
Console.WriteLine(doc.DocumentNode.InnerText);