我是网络抓取的新手。我想先获取一个“id”,然后让 WebBrowser 使用这个“id”导航到下一个页面,并在该页面上执行我需要的操作。例如:
Go to https://www.example.com/
then get "id"
navigate https://www.example.com/id
then get title name
Go to https://www.example.com/
then get "second id"
navigate https://www.example.com/id
then get title name
...
我如何在 C# 中实现这一目标?
注意:该网站使用 HTTPS(安全超文本传输协议)。
编辑:当浏览器导航到 www.example.com/id 时,DocumentCompleted 会触发两次。
[STAThread]
/// <summary>
/// Entry point: creates the shared <see cref="WebBrowser"/>, starts the first
/// navigation to the list page and pumps Windows messages so that the
/// browser's events can be delivered on this thread.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    WB = new WebBrowser();
    WB.AllowNavigation = true;
    WB.ScriptErrorsSuppressed = true;

    // Attach the handler BEFORE starting the navigation so that no
    // completion notification can ever be raised without a subscriber.
    WB.DocumentCompleted += WB_DocumentCompleted;
    WB.Navigate("https://www.example.com/page");

    // NOTE(review): the loop runs for as long as `completed` is true; the flag
    // is declared outside this method and nothing visible here changes it —
    // confirm its initial value and who is expected to clear it.
    while (completed)
    {
        Application.DoEvents();

        // Yield briefly between pump iterations; without this the loop spins
        // at 100% CPU while the browser is simply waiting on the network.
        Thread.Sleep(10);
    }
}
/// <summary>
/// Handles <c>DocumentCompleted</c> for the shared browser and drives the
/// scrape cycle: on the list page it picks the next event link and navigates
/// to it; on an event page it locates the title cell and navigates back to
/// the list page.
/// </summary>
/// <param name="sender">The browser raising the event (the shared <c>WB</c> is used).</param>
/// <param name="e">Event data; <c>e.Url</c> identifies the document that finished loading.</param>
static async void WB_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // DocumentCompleted is raised once per frame/iframe as well as for the
    // top-level document, which is why it appears to "fire twice". Only the
    // notification whose URL matches the browser's own URL is the real one.
    Uri currentUrl = WB.Url;
    if (currentUrl == null || e.Url == null || e.Url.AbsolutePath != currentUrl.AbsolutePath)
    {
        return;
    }

    System.Windows.Forms.HtmlDocument doc = WB.Document;
    if (doc == null || doc.Body == null || string.IsNullOrEmpty(doc.Body.InnerHtml))
    {
        return;
    }

    // Re-parse the rendered body with HtmlAgilityPack so XPath can be used.
    var docy = new HtmlAgilityPack.HtmlDocument();
    using (var reader = new StringReader(doc.Body.InnerHtml))
    {
        docy.Load(reader);
    }

    string url = currentUrl.ToString();

    if (url.IndexOf("page", StringComparison.Ordinal) > -1)
    {
        await NavigateToNextEventAsync(docy);

        // A navigation may just have been started; never fall through into
        // the event-page branch with the list page's document.
        return;
    }

    // eventid is null until the first link has been parsed, and an empty
    // string would match every URL, so both cases are excluded explicitly.
    if (!string.IsNullOrEmpty(eventid) && url.IndexOf(eventid, StringComparison.Ordinal) > -1)
    {
        // Cell selected from the event page; the value is not consumed yet.
        // TODO(review): read titleCell.InnerText here once the title is needed
        // (titleCell is null when the XPath matches nothing).
        HtmlNode titleCell = docy.DocumentNode.SelectSingleNode(".//*[@id='mid']//div[6]//div//table[4]//tbody//tr[2]//td[1]");

        await WaitForBrowserIdleAsync();
        WB.Navigate(new Uri("https://www.example.com/page"));
    }
}

/// <summary>
/// Finds the next not-yet-visited event link on the list page (tracked by the
/// shared row counter <c>satir</c>), stores its href and id in the shared
/// fields and navigates the browser to the corresponding event page.
/// </summary>
/// <param name="listPage">Parsed HTML of the list page.</param>
private static async Task NavigateToNextEventAsync(HtmlAgilityPack.HtmlDocument listPage)
{
    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection links = listPage.DocumentNode.SelectNodes(".//*[@id='mid']//div//div//table//tbody//tr//td//a");
    if (links == null)
    {
        return;
    }

    for (int attempt = 0; attempt < links.Count; attempt++)
    {
        HtmlNode link = listPage.DocumentNode.SelectSingleNode(".//*[@id='mid']//div[6]//div//table[2]//tbody//tr[" + satir + "]//td[2]//a");

        // Always advance the row counter; otherwise a row without a link
        // would be queried again on every iteration and never skipped.
        satir++;

        if (link == null)
        {
            continue;
        }

        hrefValue = link.GetAttributeValue("href", string.Empty);

        // The id is the part before the first '.', minus its leading character
        // (presumably a '/' as in "/12345.html" — TODO confirm against the site).
        string firstSegment = hrefValue.Split('.')[0];
        if (firstSegment.Length < 2)
        {
            continue;
        }

        eventid = firstSegment.Remove(0, 1);

        await WaitForBrowserIdleAsync();
        WB.Navigate(new Uri("https://www.example.com/" + eventid + ".html"));
        return;
    }
}

/// <summary>
/// Asynchronously waits (bounded) until the shared browser is neither busy nor
/// still loading, without blocking the UI thread or re-entering the message
/// loop via <c>Application.DoEvents()</c>.
/// </summary>
private static async Task WaitForBrowserIdleAsync()
{
    const int pollMilliseconds = 100;
    const int maxPolls = 300; // gives up after roughly 30 seconds

    for (int poll = 0; poll < maxPolls; poll++)
    {
        if (!WB.IsBusy && WB.ReadyState == WebBrowserReadyState.Complete)
        {
            return;
        }

        await Task.Delay(pollMilliseconds);
    }
}