Programmatically downloading a complete web page

Time: 2017-01-16 18:31:04

Tags: c# html webclient webclient-download

I have been having a problem automatically downloading web pages with WebClient. Here are the steps my code runs through:

  1. Retrieve the HTML as a string
  2. Iterate through the string, retrieving valid content URLs (.js, .css, .png, etc.)
  3. Download the content
  4. Replace the URLs in the HTML string with the content's local file paths
  5. Save the new HTML string to "main.html"

Everything downloads fine. But when I try to open the HTML file in Chrome, it shows a blank loading screen (for anywhere from 10 to 30 seconds). When the page finally loads, it looks like bare text with broken content.

Errors in the Chrome developer tools say that many of the .js and .css files do not exist, even though I have verified that most of them are in the specified directory.

I have tried this on multiple websites, each with the same result.

Here is the code that retrieves the HTML data:

        public string ScanPage(string url)
        {
            Console.WriteLine("Scanning url [" + url + "].");

            //"using" guarantees the client is disposed even if DownloadString throws.
            using (WebClient client = new WebClient())
            {
                client.Headers.Add("user-agent", userAgent);
                //Accept states what response type we want back; ContentType would
                //describe a request body, which a GET request does not have.
                client.Headers.Add(HttpRequestHeader.Accept, "text/html");

                string page = string.Empty;

                try
                {
                    page = client.DownloadString(url);
                    Console.WriteLine("Webpage has been scanned.");
                }
                catch (Exception e)
                {
                    Console.WriteLine("Error scanning page: " + e.Message);
                }

                return page;
            }
        }
    

Now the downloading of the data begins. This method is called first:

        public void DownloadPageContent(string url, string contentDirectory, params string[] customExtensions)
        {
            //PathSafeURL(url) takes the url and removes unsafe characters
            contentDirectory += PathSafeURL(url);
            if (Directory.Exists(contentDirectory))
                Directory.Delete(contentDirectory, true);
    
            if (!Directory.Exists(contentDirectory))
                Directory.CreateDirectory(contentDirectory);
    
            Uri uri = new Uri(url);
            string host = uri.Host;
    
            //PageResponse is used to check for valid URLs. Irrelevant to the issue.
            PageResponse urlResponse = CheckHttpPageResponse(url);
            if (urlResponse.IsSuccessful())
            {
                //Get the html page as a string.
                string data = ScanPage(url);
    
                if (!string.IsNullOrEmpty(data))
                {
                    //Download files with ".js" extension.
                    DownloadByExtension(ref data, ".js", contentDirectory + "/", "scripts/", host);
    
                    //Same as above, but with .css files.
                    DownloadByExtension(ref data, ".css", contentDirectory + "/", "css/", host);
    
                    //Iterate through custom extensions (.png, .jpg, .webm, etc.)
                    for (int i = 0; i < customExtensions.Length; i++)
                        DownloadByExtension(ref data, customExtensions[i], contentDirectory + "/", "resources/", host);
    
                    string documentDirectory = contentDirectory + "/main.html";
                    //WriteAllText creates the file (or truncates an existing one)
                    //and writes the string in a single call.
                    File.WriteAllText(documentDirectory, data);
    
                    Console.WriteLine("Page download has completed.");
                }
                else
                    Console.WriteLine("Error retrieving page data. Data was empty.");
            }
            else
                Console.WriteLine("Page could not be loaded. " + urlResponse.ToString());
        }
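        //For orientation, the on-disk layout the method above produces looks like
        //this (file names are 20 random characters, assigned in CreateContent below):
        //    <contentDirectory>/main.html
        //    <contentDirectory>/scripts/<random>.js
        //    <contentDirectory>/css/<random>.css
        //    <contentDirectory>/resources/<random>.png, <random>.webm, ...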
    
        public void DownloadByExtension(ref string data, string extension, string contentDirectory, string subDirectory, string host)
        {
            List<HtmlContent> content = new List<HtmlContent>();
            IterateContentLinks(data, extension, ref content, host);
            CreateContent(contentDirectory, subDirectory, content);
    
            for (int i = 0; i < content.Count; i++)
                data = data.Replace(content[i].OriginalText + content[i].Extension, content[i].LocalLink);
    
            Console.WriteLine("Downloaded " + content.Count + " " + extension + " files.");
            Console.WriteLine();
        }
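        //Worked example of the rewrite loop above (illustrative values only): if the
        //page contained src="/scripts/app.js", IterateContentLinks stores
        //OriginalText = "/scripts/app" and Extension = ".js", so the Replace call
        //swaps "/scripts/app.js" for the LocalLink assigned in CreateContent.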
        private void IterateContentLinks(string data, string extension, ref List<HtmlContent> content, string host)
        {
            int totalCount = data.TotalCharacters(extension + "\"");
            for (int i = 1; i < totalCount + 1; i++)
            {
                int extensionIndex = data.IndexOfNth(extension + "\"", i);
                int backTrackIndex = extensionIndex - 1;
    
                //Backtrack index from the extension index until you reach the first quotation mark.
                while (data[backTrackIndex] != '"')
                {
                    backTrackIndex -= 1;
                }
    
                string text = data.Substring(backTrackIndex + 1, (extensionIndex - backTrackIndex) - 1);
                string link = text;
    
                if (link.StartsWith("//"))
                    link = link.Insert(0, "http:");
                if (link.StartsWith("/"))
                    link = link.Insert(0, "http://" + host);
                if (!link.Contains("/")) //Assume it's in a "test.jpg" format.
                    link = link.Insert(0, "http://" + host + "/");
    
                content.Add(new HtmlContent(text, link, extension));
            }
    
            //Remove repeating links. Iterate backwards so removing an element
            //does not shift the ones that have not been checked yet.
            for (int i = content.Count - 1; i >= 0; i--)
            {
                for (int j = 0; j < i; j++)
                {
                    if (content[i].OriginalText == content[j].OriginalText)
                    {
                        content.RemoveAt(i);
                        break;
                    }
                }
            }
        }
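        //Worked example of the scan above (illustrative values only): for the fragment
        //    <link href="//cdn.example.com/style.css" rel="stylesheet">
        //TotalCharacters counts one occurrence of `.css"`, the backtrack loop walks
        //left to the opening quote, text becomes "//cdn.example.com/style", and the
        //leading "//" rule prefixes "http:" to form the download link.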
        private void CreateContent(string contentDirectory, string subDirectory, List<HtmlContent> content)
        {
            if (!Directory.Exists(contentDirectory + subDirectory))
                Directory.CreateDirectory(contentDirectory + subDirectory);
    
            Random random = new Random(Guid.NewGuid().GetHashCode());
    
            for (int i = 0; i < content.Count; i++)
            {
                content[i].RandomName = Extensions.RandomSymbols(random, 20, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890");
                content[i].LocalLink = contentDirectory + subDirectory + content[i].RandomName + content[i].Extension;
    
                bool isSuccessful = false;
                DownloadFile(content[i].DownloadLink + content[i].Extension, content[i].LocalLink, ref isSuccessful);
    
                if (isSuccessful == false)
                {
                    //Remove the failed entry and step the index back so the
                    //element that shifted into this slot is not skipped.
                    content.RemoveAt(i);
                    i--;
                }
            }
        }
        private void DownloadFile(string url, string filePath, ref bool isSuccessful)
        {
            using (WebClient client = new WebClient())
            {
                client.Headers.Add("user-agent", userAgent);
                //client.Headers.Add(HttpRequestHeader.ContentType, "image/jpg");
    
                try
                {
                    client.DownloadFile(url, filePath);
                    isSuccessful = true;
                }
                catch
                {
                    isSuccessful = false;
                    Console.WriteLine("File [" + url + "] could not be downloaded.");
                }
            }
        }
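
The string helpers used above (TotalCharacters, IndexOfNth, and Extensions.RandomSymbols) are not shown in the question. A minimal sketch of what they presumably look like, inferred only from their call sites, so the listing can be followed end to end:

        //Sketch only: these helpers are not part of the question; their bodies are
        //inferred from how they are called in the code above.
        public static class Extensions
        {
            //Counts the non-overlapping occurrences of value inside source.
            public static int TotalCharacters(this string source, string value)
            {
                int count = 0, index = 0;
                while ((index = source.IndexOf(value, index, StringComparison.Ordinal)) != -1)
                {
                    count++;
                    index += value.Length;
                }
                return count;
            }

            //Returns the index of the nth (1-based) occurrence of value, or -1.
            public static int IndexOfNth(this string source, string value, int nth)
            {
                int index = -1;
                for (int i = 0; i < nth; i++)
                {
                    index = source.IndexOf(value, index + 1, StringComparison.Ordinal);
                    if (index == -1)
                        return -1;
                }
                return index;
            }

            //Builds a string of the given length from randomly chosen symbols.
            public static string RandomSymbols(Random random, int length, string symbols)
            {
                var builder = new System.Text.StringBuilder(length);
                for (int i = 0; i < length; i++)
                    builder.Append(symbols[random.Next(symbols.Length)]);
                return builder.ToString();
            }
        }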
    

The HtmlContent class:

        public class HtmlContent
        {
            public string OriginalText { get; private set; }
            public string DownloadLink { get; private set; }
            public string Extension { get; private set; }

            public string LocalLink { get; set; }
            public string RandomName { get; set; }

            public HtmlContent(string OriginalText, string DownloadLink, string Extension)
            {
                this.OriginalText = OriginalText;
                this.DownloadLink = DownloadLink;
                this.Extension = Extension;
            }
        }
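
For context, the whole thing would presumably be kicked off with a call along these lines; the containing class name, target directory, and extension list here are placeholders, not taken from the question:

        //Hypothetical call site: "PageDownloader", the directory, and the
        //extension list are illustrative placeholders only.
        var downloader = new PageDownloader();
        downloader.DownloadPageContent("http://example.com", "C:/SavedPages/", ".png", ".jpg", ".webm");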
    

As a file downloader, this works well. As an HTML downloader, it also works fine. But as a complete offline web page downloader, it does not.

Edit:

Not sure if it matters, but I forgot to show what the userAgent variable looks like:

        private const string userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.33 Safari/537.36";
    

0 Answers:

No answers