/// <summary>
/// Opens a "Bing" tab hosting a WebBrowser, runs a Bing search for every entry in
/// <c>urlList</c>, and appends the result links found on each result page to
/// "bingscrapedurls.txt", following the "Next page" link until none is present.
/// </summary>
/// <remarks>
/// The method is <c>async void</c>, so it cannot surface exceptions to its caller;
/// failures are logged and shown to the user the same way PageLoad reports them.
/// </remarks>
public async void ScrapeBing()
{
    // Captures the href of each organic result (<li class="b_algo">) in the raw page HTML.
    const string resultPattern = "(?<=<li class=\"b_algo\"><h2><a href=\")(.*?)(?=\" h=\"ID=SERP)";
    // Markup that identifies the "Next page" anchor on a result page.
    const string nextPageMarker = "=\"sb_pagN sb_pagN_bp b_widePag sb_bp \" title=\"Next page\" href=\"";
    // Captures the href of the "Next page" anchor. The attribute spacing must match
    // nextPageMarker exactly, otherwise the marker is found but the link never is.
    const string nextPagePattern = "(?<=\" title=\"Next page\" href=\")(.*?)(?=\" h=\")";
    const string outputFile = "bingscrapedurls.txt";

    try
    {
        TabPage tab = new TabPage();
        tab.Text = "Bing";
        tabControl1.Controls.Add(tab);
        bingTab = new WebBrowser();
        bingTab.Parent = tab;
        bingTab.Dock = DockStyle.Fill;

        // Built once and reused for every page; Singleline lets a match span line breaks.
        Regex resultRegex = new Regex(resultPattern, RegexOptions.Singleline);
        Regex nextPageRegex = new Regex(nextPagePattern);

        for (int j = 0; j < urlList.Items.Count; j++)
        {
            // Escape the search text so characters such as '&', '#' or spaces stay inside the q parameter.
            string query = Uri.EscapeDataString(urlList.Items[j].ToString());
            bingTab.Navigate("https://www.bing.com/search?q=" + query);
            await PageLoad(30, 5, bingTab);

            // Read the page source once per page and work on that snapshot.
            string pageSource = bingTab.DocumentText;
            foreach (Match hit in resultRegex.Matches(pageSource))
            {
                CommonCodes.WriteToTxt(hit.Value, outputFile);
            }

            while (pageSource.Contains(nextPageMarker))
            {
                Match next = nextPageRegex.Match(pageSource);
                if (!next.Success || next.Value.Length == 0)
                {
                    // Marker present but no usable link: stop paging instead of
                    // navigating to the bare site root.
                    break;
                }

                // The href comes from raw HTML, so entities such as &amp; must be decoded before navigating.
                string nextPageUrl = System.Net.WebUtility.HtmlDecode(next.Value);
                bingTab.Navigate("https://www.bing.com" + nextPageUrl);
                await PageLoad(30, 5, bingTab);

                pageSource = bingTab.DocumentText;
                // Follow-up pages use the same result pattern as the first page.
                ScrapePages(pageSource, resultPattern, outputFile);
            }
        }

        // Report completion once, after the last query has been processed.
        if (urlList.Items.Count > 0)
        {
            MessageBox.Show("URL's Scraped.");
        }
    }
    catch (Exception ex)
    {
        CommonCodes.WriteLog(ex.ToString());
        MessageBox.Show(ex.Message);
    }
}
PageLoad方法:
/// <summary>
/// Waits until <paramref name="w"/> raises DocumentCompleted with a ReadyState of
/// Complete, or until the timeout expires, whichever happens first.
/// </summary>
/// <param name="TimeOut">Maximum time to wait, in seconds. On timeout the method returns normally.</param>
/// <param name="delay">
/// Polling interval, in seconds. Completion and timeout are only checked after each
/// interval, so the actual wait is rounded up to the next interval boundary.
/// </param>
/// <param name="w">The browser whose navigation is being awaited.</param>
/// <returns>A task that completes when the page has loaded or the timeout has elapsed.</returns>
private async Task PageLoad(int TimeOut, int delay, WebBrowser w)
{
    TaskCompletionSource<bool> pageLoaded = new TaskCompletionSource<bool>();

    // Kept in a named delegate so it can be detached again; otherwise every call
    // would leave one more handler attached to the browser.
    WebBrowserDocumentCompletedEventHandler onCompleted = (s, e) =>
    {
        // DocumentCompleted can fire while the browser is not yet in the Complete
        // state; only the event seen in that state counts as "loaded".
        if (w.ReadyState != WebBrowserReadyState.Complete)
        {
            return;
        }

        pageLoaded.TrySetResult(true);
    };

    try
    {
        w.DocumentCompleted += onCompleted;

        // Wall-clock deadline so the timeout really is TimeOut seconds,
        // independent of the polling interval.
        DateTime deadline = DateTime.UtcNow.AddSeconds(TimeOut);

        while (!pageLoaded.Task.IsCompleted)
        {
            await Task.Delay(delay * 1000);

            if (DateTime.UtcNow >= deadline)
            {
                // Give up waiting; callers proceed with whatever has loaded so far.
                pageLoaded.TrySetResult(true);
            }
        }
    }
    catch (Exception ex)
    {
        CommonCodes.WriteLog(ex.ToString());
        MessageBox.Show(ex.Message);
    }
    finally
    {
        if (w != null)
        {
            w.DocumentCompleted -= onCompleted;
        }
    }
}
对 bing.com 执行下面这行代码时,得到的页面源代码是空的,而我对 google.com 页面使用的是同样的代码:
`string pageSource = this.bingTab.DocumentText.ToString();`
有什么想法吗?谢谢。我也试过 DocumentStream,同样没用,不知道为什么会这样。ScrapeBing 方法用于从 Bing 搜索结果页面中抓取网址,而 PageLoad 是等待页面加载完成的方法。