I've been running into a problem using WebClient to automatically download webpages. Below are the steps my code goes through.

Everything downloads fine, but when I try to open the saved html file in Chrome, I get a blank loading screen (anywhere from 10 to 30 seconds). When the page finally loads, it looks like bare text with broken content.

The errors in Chrome's developer tools say that a lot of the .js and .css files don't exist, even though I've verified that most of those files are in the specified directories.

I've tried multiple websites, each with the same result.

Here is the code that retrieves the html data:
    public string ScanPage(string url)
    {
        Console.WriteLine("Scanning url [" + url + "].");
        string page = string.Empty;
        //The using block disposes the client even if DownloadString throws.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("user-agent", userAgent);
            //Accept (not Content-Type) describes the desired response type; a GET request has no body for Content-Type to describe.
            client.Headers.Add(HttpRequestHeader.Accept, "text/html");
            try
            {
                page = client.DownloadString(url);
                Console.WriteLine("Webpage has been scanned.");
            }
            catch (Exception e)
            {
                Console.WriteLine("Error scanning page: " + e.Message);
            }
        }
        return page;
    }
Now to start downloading the data. This method is called first.
    public void DownloadPageContent(string url, string contentDirectory, params string[] customExtensions)
    {
        //PathSafeURL(url) strips characters that are unsafe in a path (a minimal sketch follows this method).
        contentDirectory += PathSafeURL(url);
        //Start with a clean directory for this page.
        if (Directory.Exists(contentDirectory))
            Directory.Delete(contentDirectory, true);
        Directory.CreateDirectory(contentDirectory);
        Uri uri = new Uri(url);
        string host = uri.Host;
        //PageResponse is used to check for valid URLs. Irrelevant to the issue.
        PageResponse urlResponse = CheckHttpPageResponse(url);
        if (urlResponse.IsSuccessful())
        {
            //Get the html page as a string.
            string data = ScanPage(url);
            if (!string.IsNullOrEmpty(data))
            {
                //Download files with the ".js" extension.
                DownloadByExtension(ref data, ".js", contentDirectory + "/", "scripts/", host);
                //Same as above, but with .css files.
                DownloadByExtension(ref data, ".css", contentDirectory + "/", "css/", host);
                //Iterate through custom extensions (.png, .jpg, .webm, etc.)
                for (int i = 0; i < customExtensions.Length; i++)
                    DownloadByExtension(ref data, customExtensions[i], contentDirectory + "/", "resources/", host);
                string documentDirectory = contentDirectory + "/main.html";
                //WriteAllText creates (or overwrites) the file in a single call.
                File.WriteAllText(documentDirectory, data);
                Console.WriteLine("Page download has completed.");
            }
            else
                Console.WriteLine("Error retrieving page data. Data was empty.");
        }
        else
            Console.WriteLine("Page could not be loaded. " + urlResponse.ToString());
    }
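PathSafeURL isn't shown above; for completeness, here is a minimal sketch consistent with its comment (it just strips characters that are invalid in a directory name; the exact implementation may differ):

    //A minimal sketch of PathSafeURL, assuming it only needs to strip
    //characters that are invalid in a directory name.
    private string PathSafeURL(string url)
    {
        foreach (char c in Path.GetInvalidFileNameChars())
            url = url.Replace(c.ToString(), string.Empty);
        return url;
    }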
    public void DownloadByExtension(ref string data, string extension, string contentDirectory, string subDirectory, string host)
    {
        List<HtmlContent> content = new List<HtmlContent>();
        IterateContentLinks(data, extension, ref content, host);
        CreateContent(contentDirectory, subDirectory, content);
        //Rewrite each original link in the html so it points at the downloaded copy.
        for (int i = 0; i < content.Count; i++)
            data = data.Replace(content[i].OriginalText + content[i].Extension, content[i].LocalLink);
        Console.WriteLine("Downloaded " + content.Count + " " + extension + " files.");
        Console.WriteLine();
    }
    private void IterateContentLinks(string data, string extension, ref List<HtmlContent> content, string host)
    {
        //TotalCharacters and IndexOfNth are custom string extensions (minimal sketches follow this method).
        int totalCount = data.TotalCharacters(extension + "\"");
        for (int i = 1; i < totalCount + 1; i++)
        {
            int extensionIndex = data.IndexOfNth(extension + "\"", i);
            int backTrackIndex = extensionIndex - 1;
            //Backtrack from the extension index until you reach the opening quotation mark.
            while (data[backTrackIndex] != '"')
            {
                backTrackIndex -= 1;
            }
            string text = data.Substring(backTrackIndex + 1, (extensionIndex - backTrackIndex) - 1);
            string link = text;
            if (link.StartsWith("//"))
                link = link.Insert(0, "http:");
            if (link.StartsWith("/"))
                link = link.Insert(0, "http://" + host);
            if (!link.Contains("/")) //Assume it's in a "test.jpg" format.
                link = link.Insert(0, "http://" + host + "/");
            content.Add(new HtmlContent(text, link, extension));
        }
        //Remove repeating links. The inner loop runs backwards so a removal
        //doesn't shift unvisited elements past the index.
        for (int i = 0; i < content.Count; i++)
        {
            for (int j = content.Count - 1; j > i; j--)
            {
                if (content[i].OriginalText == content[j].OriginalText)
                    content.RemoveAt(j);
            }
        }
    }
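TotalCharacters and IndexOfNth are small string extensions; below are minimal sketches consistent with how they're used above (assumed behavior, not necessarily the exact implementations): TotalCharacters counts the occurrences of a substring, and IndexOfNth returns the index of the nth occurrence.

    public static class StringExtensions
    {
        //Assumed behavior: counts the occurrences of value within source.
        public static int TotalCharacters(this string source, string value)
        {
            int count = 0;
            int index = source.IndexOf(value);
            while (index != -1)
            {
                count++;
                index = source.IndexOf(value, index + 1);
            }
            return count;
        }

        //Assumed behavior: returns the index of the nth (1-based) occurrence of value, or -1 if there is none.
        public static int IndexOfNth(this string source, string value, int n)
        {
            int index = -1;
            for (int i = 0; i < n; i++)
            {
                index = source.IndexOf(value, index + 1);
                if (index == -1)
                    return -1;
            }
            return index;
        }
    }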
    private void CreateContent(string contentDirectory, string subDirectory, List<HtmlContent> content)
    {
        if (!Directory.Exists(contentDirectory + subDirectory))
            Directory.CreateDirectory(contentDirectory + subDirectory);
        Random random = new Random(Guid.NewGuid().GetHashCode());
        for (int i = 0; i < content.Count; i++)
        {
            //RandomSymbols is a custom helper (a minimal sketch follows this method); it builds a random file name.
            content[i].RandomName = Extensions.RandomSymbols(random, 20, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890");
            content[i].LocalLink = contentDirectory + subDirectory + content[i].RandomName + content[i].Extension;
            bool isSuccessful = false;
            DownloadFile(content[i].DownloadLink + content[i].Extension, content[i].LocalLink, ref isSuccessful);
            if (!isSuccessful)
            {
                content.RemoveAt(i);
                i--; //Step the index back so the element that shifts into this slot isn't skipped.
            }
        }
    }
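Extensions.RandomSymbols just assembles a random name from the given character set; a minimal sketch (assumed implementation) is:

    public static class Extensions
    {
        //Assumed implementation: builds a string of `count` characters chosen at random from `symbols`.
        public static string RandomSymbols(Random random, int count, string symbols)
        {
            var builder = new System.Text.StringBuilder(count);
            for (int i = 0; i < count; i++)
                builder.Append(symbols[random.Next(symbols.Length)]);
            return builder.ToString();
        }
    }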
    private void DownloadFile(string url, string filePath, ref bool isSuccessful)
    {
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("user-agent", userAgent);
            //client.Headers.Add(HttpRequestHeader.ContentType, "image/jpg");
            try
            {
                client.DownloadFile(url, filePath);
                isSuccessful = true;
            }
            catch
            {
                isSuccessful = false;
                Console.WriteLine("File [" + url + "] could not be downloaded.");
            }
        }
    }
The HtmlContent class:
    public class HtmlContent
    {
        public string OriginalText { get; private set; }
        public string DownloadLink { get; private set; }
        public string Extension { get; private set; }
        public string LocalLink { get; set; }
        public string RandomName { get; set; }

        public HtmlContent(string originalText, string downloadLink, string extension)
        {
            OriginalText = originalText;
            DownloadLink = downloadLink;
            Extension = extension;
        }
    }
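For reference, a typical call looks something like this (the class name, url, and output directory here are placeholders, not my real values):

    //Hypothetical usage; PageDownloader, the url, and the directory are placeholders.
    var downloader = new PageDownloader();
    downloader.DownloadPageContent("http://example.com", "C:/PageDownloads/", ".png", ".jpg", ".webm");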
As a file downloader, this works great. As an html downloader, it also works fine. But as a complete offline webpage downloader, it does not.

EDIT:

Not sure if it matters, but I forgot to show what the userAgent variable looks like:
    private const string userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.33 Safari/537.36";