我有一个使用webbrowser控件开发的网络爬虫。有时它工作正常 - 它到达登录页面主页目录菜单页面产品详细信息页面返回目录菜单页面。但在某些时候,它只是到达主页,然后它给出一个空白页面。它没有获得所需的元素ID。
我尝试了以下给出的各种建议......他们没有帮助。有趣的是,这个问题是一个间歇性的问题。有什么想法来克服这个间歇性问题需要做些什么?
注意:由于它是一个内部网站,我将无法共享该网站的网址。
参考
CODE
public partial class Form1 : Form
{
private System.Windows.Forms.WebBrowser wb = null;
private ListBox listBox1 = null;
List<string> visitedUrls = new List<string>();
List<string> visitedProducts = new List<string>();
bool isFirstPage = true;
string clickType = String.Empty;
bool isUnvisitedProductExist = true;
private void ExerciseApp(object sender, EventArgs e)
{
#region Listbox Data Filling
if (listBox1.Items.Count == 0)
{
listBox1.Items.Add("Start--" + DateTime.Now.ToString());
}
else
{
if (listBox1.Items.Count == 2)
{
listBox1.Items.RemoveAt(1);
}
listBox1.Items.Add("Now--" + DateTime.Now.ToString());
}
#endregion
WriteLogFunction(" -----------------------------------------------");
#region Login
//Check whether login page
if (isFirstPage)
{
HtmlElement logonId = this.wb.Document.GetElementById("logonId");
HtmlElement password = this.wb.Document.GetElementById("logonPassword");
HtmlElement btnLogin = this.wb.Document.GetElementById("WC_AccountDisplay_links_2");
if (logonId != null && password != null && btnLogin != null)
{
logonId.InnerText = ConfigValues.userName;
password.InnerText = ConfigValues.passwordText;
isFirstPage = false;
//Call click for login
btnLogin.InvokeMember("click");
}
}
#endregion
bool isClickCalled = false;
#region Specific Product Details
int catalogElementIterationCounter = 0;
var elementsToConsider = wb.Document.All;
bool isMenuPage = false;
foreach (HtmlElement e1 in elementsToConsider)
{
catalogElementIterationCounter++;
string x = e1.TagName;
String idStr = e1.GetAttribute("id");
if (!String.IsNullOrWhiteSpace(idStr))
{
//Each Product Navigation
if (idStr.Contains("catalogEntry_img"))
{
isMenuPage = true;
string productUrl = e1.GetAttribute("href");
if (!visitedProducts.Contains(productUrl))
{
WriteLogFunction("p__" + productUrl);
isUnvisitedProductExist = true;
visitedProducts.Add(productUrl);
isClickCalled = true;
clickType = "Product";
e1.InvokeMember("Click");
break;
}
}
}
if (isMenuPage)
{
//Even after traversing the page, there is no unvisited product pending.
//So good to go for next page
if (catalogElementIterationCounter == elementsToConsider.Count - 1)
{
isUnvisitedProductExist = false;
}
}
}
#endregion
#region Menu Page
if (!isClickCalled)
{
#region Time Delay
try
{
DateTime start = DateTime.Now;
if (!wb.IsDisposed)
{
while (wb.ReadyState != WebBrowserReadyState.Complete)
{
System.Windows.Forms.Application.DoEvents();
if (wb.IsDisposed || DateTime.Now.Subtract(start).TotalSeconds > 2)
{
// Time limit break and dispose break
break;
}
}
}
}
catch (Exception ex)
{
WriteLogFunction(ex.Message);
//Supress the exception
}
#endregion
int menuPageIterationCounter = 0;
bool isMatchFound = false;
WriteLogFunction("Count-" + wb.Document.All.Count);
var elementsInMenuPage = wb.Document.All;
foreach (HtmlElement e1 in elementsInMenuPage)
{
menuPageIterationCounter++;
string x = e1.TagName;
String idStr = e1.GetAttribute("id");
WriteLogFunction("Before--"+idStr);
#region time Delay
try
{
DateTime start = DateTime.Now;
if (!wb.IsDisposed)
{
while (wb.ReadyState != WebBrowserReadyState.Complete)
{
System.Windows.Forms.Application.DoEvents();
if (wb.IsDisposed || DateTime.Now.Subtract(start).TotalSeconds > 50)
{
// Time limit break and dispose break
break;
}
}
}
}
catch (Exception ex)
{
WriteLogFunction(ex.Message);
//Supress the exception
}
#endregion
WriteLogFunction("After--" + idStr);
//Main Menu Item Navigation
if (idStr.Contains("WC_CachedHeaderDisplay_links"))
{
WriteLogFunction("*******INSIDE");
string url = e1.GetAttribute("href");
string latestUrl = String.Empty;
if (visitedUrls.Count > 0)
{
latestUrl = visitedUrls[visitedUrls.Count - 1];
}
WriteLogFunction("L__" + latestUrl);
WriteLogFunction("isUnvisitedProductExist__" + isUnvisitedProductExist.ToString());
if (visitedUrls.Contains(url) && isUnvisitedProductExist)
{
if (latestUrl == url)
{
isMatchFound = true;
clickType = "Menu";
WriteLogFunction("u1__" + url);
e1.InvokeMember("Click");
break;
}
}
else if (!visitedUrls.Contains(url))
{
isMatchFound = true;
//Reset visited Products
visitedProducts = new List<string>();
visitedUrls.Add(url);
clickType = "Menu";
WriteLogFunction("u2__" + url);
e1.InvokeMember("Click");
break;
}
if (!isMatchFound && (menuPageIterationCounter == elementsInMenuPage.Count - 1))
{
//wb.Navigate(websiteUrl);
//Application.Exit();
//Environment.Exit(0);
}
}
}
}
#endregion
}
public Form1()
{
// listBox1
listBox1 = new ListBox();
listBox1.Location = new Point(10, 10);
listBox1.Size = new Size(500, 50);
this.Controls.Add(listBox1);
// Web Browser
wb = new WebBrowser();
wb.Location = new Point(10, 80);
wb.Size = new Size(900, 900);
//wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(ExerciseApp);
wb.ScriptErrorsSuppressed = true;
wb.Url = new Uri(ConfigValues.websiteUrl);
// Form1
this.Text = "Web Browser Test";
this.Size = new Size(950, 950);
this.Controls.Add(wb);
this.Load += Form1_Load;
}
private void Form1_Load(object sender, EventArgs e)
{
this.wb.DocumentCompleted += delegate
{
// DocumentCompleted is fired before window.onload and body.onload
this.wb.Document.Window.AttachEventHandler("onload", delegate
{
// Defer this to make sure all possible onload event handlers got fired
System.Threading.SynchronizationContext.Current.Post(delegate
{
MessageBox.Show("window.onload was fired, can access DOM!");
ExerciseApp(null, null);
}, null);
});
};
this.wb.Navigate(ConfigValues.websiteUrl);
}
private void WriteLogFunction(string strMessage)
{
using (StreamWriter w = File.AppendText("log.txt"))
{
w.WriteLine("\r\n{0} {1} ", DateTime.Now.ToLongTimeString(), strMessage);
}
}
}