我正在编写一个Web爬虫，它使用WebBrowser控件渲染网页中的JavaScript，然后输出生成的HTML代码以供抓取。我遇到的问题是，打开的虚拟浏览器窗口似乎没有被关闭，因为爬取几分钟后我会收到如下错误：“未处理的异常：System.Runtime.InteropServices.COMException：当前进程已使用了系统允许的所有窗口管理器对象句柄。”我在一个MVC Web项目中编写了以下代码：
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Threading;
namespace Abot.Demo
{
// Threaded version
/// <summary>
/// Loads a URL in a WinForms <see cref="WebBrowser"/> control and returns the
/// inner HTML of the document body once the control reports completion.
/// </summary>
public class HeadlessBrowser
{
    // Shared state used only by the WebBrowserThread / wb_DocumentCompleted pair.
    private static string GeneratedSource { get; set; }
    private static string URL { get; set; }

    /// <summary>
    /// Navigates a temporary <see cref="WebBrowser"/> to <paramref name="url"/>,
    /// pumps messages until the first DocumentCompleted event, and returns the
    /// body's inner HTML.
    /// </summary>
    /// <param name="url">Address passed to <see cref="WebBrowser.Navigate(string)"/>.</param>
    /// <returns>
    /// The body's inner HTML, or <c>null</c> when navigation could not be started
    /// or the loaded document has no body.
    /// </returns>
    /// <remarks>
    /// When the caller is not on an STA thread, the work runs on a dedicated STA
    /// thread that is joined before returning. When the caller is already STA,
    /// the message loop runs on the calling thread.
    /// </remarks>
    public static string GetGeneratedHTML(string url)
    {
        string result = null;

        ThreadStart pumpMessages = () =>
        {
            WebBrowser wb = null;
            EventHandler idleHandler = null;

            idleHandler = (s, e) =>
            {
                // Run the setup exactly once, on the first idle tick of the loop.
                Application.Idle -= idleHandler;
                try
                {
                    wb = new WebBrowser();
                    wb.DocumentCompleted += (s2, e2) =>
                    {
                        result = ReadBodyHtml(wb);
                        // End only this thread's loop; Application.Exit() would
                        // signal every message pump in the process.
                        Application.ExitThread();
                    };
                    wb.Navigate(url);
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.ToString());
                    // Without this the loop below would never end, the thread
                    // would hang, and the control's handles would stay allocated.
                    Application.ExitThread();
                }
            };

            Application.Idle += idleHandler;
            try
            {
                Application.Run();
            }
            finally
            {
                // Clean up on every exit path, outside the control's own event
                // handler, so the browser is always released.
                Application.Idle -= idleHandler;
                if (wb != null)
                {
                    wb.Dispose();
                }
            }
        };

        if (Thread.CurrentThread.GetApartmentState() == ApartmentState.STA)
        {
            pumpMessages();
        }
        else
        {
            Thread t = new Thread(pumpMessages);
            t.SetApartmentState(ApartmentState.STA);
            t.Start();
            t.Join();
        }

        return result;
    }

    /// <summary>
    /// Loads <see cref="URL"/> in a temporary browser, waits until the control
    /// reports <see cref="WebBrowserReadyState.Complete"/>, and stores the body
    /// HTML in <see cref="GeneratedSource"/>.
    /// </summary>
    private static void WebBrowserThread()
    {
        WebBrowser wb = new WebBrowser();
        try
        {
            // Subscribe before navigating so the completion event cannot be missed.
            wb.DocumentCompleted +=
                new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);

            try
            {
                wb.Navigate(URL);
            }
            catch (Exception exc)
            {
                Console.WriteLine(exc);
                // Navigation never started, so waiting for completion would spin forever.
                return;
            }

            // Pump messages while waiting: the control cannot make progress
            // if this thread spins without processing its message queue.
            while (wb.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
                Thread.Sleep(10);
            }

            // Read once more after completion, because the final HTML takes a while to show up.
            GeneratedSource = ReadBodyHtml(wb);
        }
        finally
        {
            // Dispose last; the original called Stop() on an already disposed control.
            wb.Dispose();
        }
    }

    /// <summary>
    /// DocumentCompleted handler that copies the sender's body HTML into
    /// <see cref="GeneratedSource"/>.
    /// </summary>
    private static void wb_DocumentCompleted(object sender,
        WebBrowserDocumentCompletedEventArgs e)
    {
        WebBrowser wb = (WebBrowser)sender;
        GeneratedSource = ReadBodyHtml(wb);
    }

    /// <summary>
    /// Returns the inner HTML of the browser's document body, or <c>null</c>
    /// when the browser has no document or the document has no body.
    /// </summary>
    private static string ReadBodyHtml(WebBrowser browser)
    {
        if (browser == null || browser.Document == null || browser.Document.Body == null)
        {
            return null;
        }

        return browser.Document.Body.InnerHtml;
    }
}
}