使用WebBrowser和单线程单元C#进行Web刮擦

时间:2014-06-14 19:27:44

标签: c# winforms web-scraping webbrowser-control async-await

这个问题是关于Noseratio代码在这个问题上略有不同的变化:[link] How to cancel Task await after a timeout period

我能够完全按原样构建他的代码,以创建一个控制台应用程序,该应用程序返回代码中指定的三个网页中每个网页的URL和OuterHtml。但是,当我在WinForms应用程序中放入相同的代码时,我得到的唯一输出是

URL:
http://example.com

这意味着代码不显示第一页的OuterHtml,并且它不显示其他两个页面中的任何一个的URL或OuterHtml。我想知道的是WinForms打破代码以及如何绕过它。

我预计,由于代码仍会创建一个新的STA,这会创建一个新的线程,因此我使用WinForm并不重要。我使用的是VS2013 Ultimate,.NET 4.5.1。

WinForm包含一个按钮。点击它由private void button1_Click(object sender, EventArgs e)处理,其身体与Noseratio代码中的Main相同。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Microsoft.Win32;

namespace WebScraperAsync005
{
    public partial class Form1 : Form
    {

        // main logic
        static async Task ScrapSitesAsync(string[] urls, CancellationToken token)
        {
            using (var apartment = new MessageLoopApartment())
            {
                // create WebBrowser inside MessageLoopApartment
                var webBrowser = apartment.Invoke(() => new WebBrowser());
                try
                {
                    foreach (var url in urls)
                    {
                        Console.WriteLine("URL:\n" + url);

                        // cancel in 30s or when the main token is signalled
                        var navigationCts =  CancellationTokenSource.CreateLinkedTokenSource(token);

                          navigationCts.CancelAfter((int)TimeSpan.FromSeconds(30).TotalMilliseconds);
                        var navigationToken = navigationCts.Token;

                        // run the navigation task inside MessageLoopApartment
                        string html = await apartment.Run(() =>
                        webBrowser.NavigateAsync(url, navigationToken), navigationToken);

                        Console.WriteLine("HTML:\n" + html);
                    }
                }
                finally
                {
                    // dispose of WebBrowser inside MessageLoopApartment
                    apartment.Invoke(() => webBrowser.Dispose());
                }
            }
        }





        public Form1()
        {
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            try
            {
                WebBrowserExt.SetFeatureBrowserEmulation(); // enable HTML5

                var cts = new   CancellationTokenSource((int)TimeSpan.FromMinutes(3).TotalMilliseconds);

                var task = ScrapSitesAsync(
                    new[] { "http://example.com", "http://example.org",   "http://example.net" },
                    cts.Token);

                task.Wait();

                Console.WriteLine("Press Enter to exit...");
                Console.ReadLine();
            }
            catch (Exception ex)
            {
                while (ex is AggregateException && ex.InnerException != null)
                    ex = ex.InnerException;
                Console.WriteLine(ex.Message);
                Environment.Exit(-1);
            }
        }
    }



    /// <summary>
    /// WebBrowserExt - WebBrowser extensions
    /// by Noseratio - https://stackoverflow.com/a/22262976/1768303
    /// </summary>
    public static class WebBrowserExt
    {
        const int POLL_DELAY = 500;

        // navigate and download 
        public static async Task<string> NavigateAsync(this WebBrowser webBrowser,     string url, CancellationToken token)
        {
            // navigate and await DocumentCompleted
            var tcs = new TaskCompletionSource<bool>();
            WebBrowserDocumentCompletedEventHandler handler = (s, arg) =>
            tcs.TrySetResult(true);

            using (token.Register(() => tcs.TrySetCanceled(), useSynchronizationContext: true))
            {
                webBrowser.DocumentCompleted += handler;
                try
                {
                    webBrowser.Navigate(url);
                    await tcs.Task; // wait for DocumentCompleted
                }
                finally
                {
                    webBrowser.DocumentCompleted -= handler;
                }
            }

            // get the root element
            var documentElement = webBrowser.Document.GetElementsByTagName("html")[0];

            // poll the current HTML for changes asynchronosly
            var html = documentElement.OuterHtml;
            while (true)
            {
                // wait asynchronously, this will throw if cancellation requested
                await Task.Delay(POLL_DELAY, token);

                // continue polling if the WebBrowser is still busy
                if (webBrowser.IsBusy)
                    continue;

                var htmlNow = documentElement.OuterHtml;
                if (html == htmlNow)
                    break; // no changes detected, end the poll loop

                html = htmlNow;
            }

            // consider the page fully rendered 
            token.ThrowIfCancellationRequested();
            return html;
        }

        // enable HTML5 (assuming we're running IE10+)
        // more info: https://stackoverflow.com/a/18333982/1768303
        public static void SetFeatureBrowserEmulation()
        {
            if (System.ComponentModel.LicenseManager.UsageMode !=     System.ComponentModel.LicenseUsageMode.Runtime)
                return;
            var appName = System.IO.Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileNa    me);
            Registry.SetValue(@"HKEY_CURRENT_USER\Software\Microsoft\Internet Explorer\Main\FeatureControl\FEATURE_BROWSER_EMULATION",
                appName, 10000, RegistryValueKind.DWord);
        }
    }

    /// <summary>
    /// MessageLoopApartment
    /// STA thread with message pump for serial execution of tasks
    /// by Noseratio - https://stackoverflow.com/a/22262976/1768303
    /// </summary>
    public class MessageLoopApartment : IDisposable
    {
        Thread _thread; // the STA thread

        TaskScheduler _taskScheduler; // the STA thread's task scheduler

        public TaskScheduler TaskScheduler { get { return _taskScheduler; } }

        /// <summary>MessageLoopApartment constructor</summary>
        public MessageLoopApartment()
        {
            var tcs = new TaskCompletionSource<TaskScheduler>();

            // start an STA thread and gets a task scheduler
            _thread = new Thread(startArg =>
            {
                EventHandler idleHandler = null;

                idleHandler = (s, e) =>
                {
                    // handle Application.Idle just once
                    Application.Idle -= idleHandler;
                    // return the task scheduler
                    tcs.SetResult(TaskScheduler.FromCurrentSynchronizationContext());
                };

                // handle Application.Idle just once
                // to make sure we're inside the message loop
                // and SynchronizationContext has been correctly installed
                Application.Idle += idleHandler;
                Application.Run();
            });

            _thread.SetApartmentState(ApartmentState.STA);
            _thread.IsBackground = true;
            _thread.Start();
            _taskScheduler = tcs.Task.Result;
        }

        /// <summary>shutdown the STA thread</summary>
        public void Dispose()
        {
            if (_taskScheduler != null)
            {
                var taskScheduler = _taskScheduler;
                _taskScheduler = null;

                // execute Application.ExitThread() on the STA thread
                Task.Factory.StartNew(
                    () => Application.ExitThread(),
                    CancellationToken.None,
                    TaskCreationOptions.None,
                    taskScheduler).Wait();

                _thread.Join();
                _thread = null;
            }
        }

        /// <summary>Task.Factory.StartNew wrappers</summary>
        public void Invoke(Action action)
        {
            Task.Factory.StartNew(action,
                CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Wait();
        }

        public TResult Invoke<TResult>(Func<TResult> action)
        {
            return Task.Factory.StartNew(action,
                CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Result;
        }

        public Task Run(Action action, CancellationToken token)
        {
            return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
        }

        public Task<TResult> Run<TResult>(Func<TResult> action, CancellationToken token)
        {
            return Task.Factory.StartNew(action, token, TaskCreationOptions.None,     _taskScheduler);
        }

        public Task Run(Func<Task> action, CancellationToken token)
        {
            return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler).Unwrap();
        }

        public Task<TResult> Run<TResult>(Func<Task<TResult>> action, CancellationToken token)
        {
            return Task.Factory.StartNew(action, token, TaskCreationOptions.None,     _taskScheduler).Unwrap();
        }
    }


}

1 个答案:

答案 0 :(得分:3)

您链接的代码专为在控制台应用或Windows服务中使用而设计。当您从WinForms应用程序中使用它时,您会遇到task.Wait()导致的死锁:

var task = ScrapSitesAsync(
    new[] { "http://example.com", "http://example.org", "http://example.net" },
    cts.Token);

task.Wait();

要了解此死锁的性质,请参阅Stephen Cleary的"Don't Block on Async Code"

最简单的解决方法(但不是正确的解决方法)可能是在此处添加ConfigureAwait(false)

// run the navigation task inside MessageLoopApartment
string html = await apartment.Run(() =>
    webBrowser.NavigateAsync(url, navigationToken), navigationToken).ConfigureAwait(false);

然而,正确的解决方法是完全摆脱MessageLoopApartment,对于已经拥有自己的消息循环的WinForms应用来说,这是多余的。然后,使用await task代替task.Wait()

这样,代码实际上变得更简单了:

namespace WebScraperAsync005
{
    public partial class Form1 : Form
    {

        // main logic
        static async Task ScrapSitesAsync(string[] urls, CancellationToken token)
        {
            using(var webBrowser = new WebBrowser())
            {
                foreach (var url in urls)
                {
                    Console.WriteLine("URL:\n" + url);

                    // cancel in 30s or when the main token is signalled
                    var navigationCts = CancellationTokenSource.CreateLinkedTokenSource(token);

                    navigationCts.CancelAfter((int)TimeSpan.FromSeconds(30).TotalMilliseconds);
                    var navigationToken = navigationCts.Token;

                    // run the navigation task inside MessageLoopApartment
                    string html = await webBrowser.NavigateAsync(url, navigationToken);

                    Console.WriteLine("HTML:\n" + html);
                }
            }
        }

        public Form1()
        {
            InitializeComponent();
        }

        private async void button1_Click(object sender, EventArgs e)
        {
            this.button1.Enabled = false;
            try
            {
                WebBrowserExt.SetFeatureBrowserEmulation(); // enable HTML5

                var cts = new CancellationTokenSource((int)TimeSpan.FromMinutes(3).TotalMilliseconds);

                await ScrapSitesAsync(
                    new[] { "http://example.com", "http://example.org", "http://example.net" },
                    cts.Token);

                MessageBox.Show("Completed.");
            }
            catch (Exception ex)
            {
                while (ex is AggregateException && ex.InnerException != null)
                    ex = ex.InnerException;
                MessageBox.Show(ex.Message);
            }
            this.button1.Enabled = true;
        }
    }

    /// <summary>
    /// WebBrowserExt - WebBrowser extensions
    /// by Noseratio - http://stackoverflow.com/a/22262976/1768303
    /// </summary>
    public static class WebBrowserExt
    {
        const int POLL_DELAY = 500;

        // navigate and download 
        public static async Task<string> NavigateAsync(this WebBrowser webBrowser, string url, CancellationToken token)
        {
            // navigate and await DocumentCompleted
            var tcs = new TaskCompletionSource<bool>();
            WebBrowserDocumentCompletedEventHandler handler = (s, arg) =>
            tcs.TrySetResult(true);

            using (token.Register(() => tcs.TrySetCanceled(), useSynchronizationContext: true))
            {
                webBrowser.DocumentCompleted += handler;
                try
                {
                    webBrowser.Navigate(url);
                    await tcs.Task; // wait for DocumentCompleted
                }
                finally
                {
                    webBrowser.DocumentCompleted -= handler;
                }
            }

            // get the root element
            var documentElement = webBrowser.Document.GetElementsByTagName("html")[0];

            // poll the current HTML for changes asynchronosly
            var html = documentElement.OuterHtml;
            while (true)
            {
                // wait asynchronously, this will throw if cancellation requested
                await Task.Delay(POLL_DELAY, token);

                // continue polling if the WebBrowser is still busy
                if (webBrowser.IsBusy)
                    continue;

                var htmlNow = documentElement.OuterHtml;
                if (html == htmlNow)
                    break; // no changes detected, end the poll loop

                html = htmlNow;
            }

            // consider the page fully rendered 
            token.ThrowIfCancellationRequested();
            return html;
        }

        // enable HTML5 (assuming we're running IE10+)
        // more info: http://stackoverflow.com/a/18333982/1768303
        public static void SetFeatureBrowserEmulation()
        {
            if (System.ComponentModel.LicenseManager.UsageMode !=     System.ComponentModel.LicenseUsageMode.Runtime)
                return;
            var appName = System.IO.Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);
            Registry.SetValue(@"HKEY_CURRENT_USER\Software\Microsoft\Internet Explorer\Main\FeatureControl\FEATURE_BROWSER_EMULATION",
                appName, 10000, RegistryValueKind.DWord);
        }
    }
}