WebBrowser 控件的页面源代码(DocumentText)为空

时间:2019-01-24 13:54:01

标签: c#

/// <summary>
/// Adds a "Bing" tab hosting a <see cref="WebBrowser"/>, runs one Bing search for
/// every entry in <c>urlList</c>, and appends each result link found on the result
/// pages to <c>bingscrapedurls.txt</c>, following "Next page" links until the pager
/// no longer offers one.
/// </summary>
/// <remarks>
/// The method stays <c>async void</c> so existing call sites keep compiling. Since a
/// fault in an async void method cannot be observed by the caller, failures are
/// logged and reported here, the same way <c>PageLoad</c> reports them.
/// </remarks>
public async void ScrapeBing()
        {
            // Isolates the href of one organic result (<li class="b_algo">) via look-behind/look-ahead.
            const string resultLinkPattern = "(?<=<li class=\"b_algo\"><h2><a href=\")(.*?)(?=\" h=\"ID=SERP)";
            // Text the pager anchor carries while another result page is available.
            const string nextPageMarker = "=\"sb_pagN sb_pagN_bp b_widePag sb_bp \" title=\"Next page\" href=\"";
            // Must use the same attribute spelling as the marker above (no spaces around '='),
            // otherwise the guard succeeds but the href is never extracted.
            const string nextPageLinkPattern = "(?<=\" title=\"Next page\" href=\")(.*?)(?=\")";
            const string outputFile = "bingscrapedurls.txt";

            try
            {
                TabPage tab = new TabPage();
                tab.Text = "Bing";
                tabControl1.Controls.Add(tab);
                bingTab = new WebBrowser();
                bingTab.Parent = tab;
                bingTab.Dock = DockStyle.Fill;

                Regex nextPageLink = new Regex(nextPageLinkPattern);

                for (int j = 0; j < urlList.Items.Count; j++)
                {
                    // Escape the search term so characters such as '&', '#' or spaces
                    // cannot truncate or alter the query string.
                    string query = Uri.EscapeDataString(urlList.Items[j].ToString());
                    bingTab.Navigate("https://www.bing.com/search?q=" + query);
                    await PageLoad(30, 5, bingTab);

                    // Read the page once; every check below works on this snapshot.
                    string pageSource = bingTab.DocumentText ?? string.Empty;
                    foreach (Match link in Regex.Matches(pageSource, resultLinkPattern))
                    {
                        CommonCodes.WriteToTxt(link.Value, outputFile);
                    }

                    while (pageSource.Contains(nextPageMarker))
                    {
                        Match next = nextPageLink.Match(pageSource);
                        if (!next.Success || next.Value.Length == 0)
                        {
                            // Without a usable href we would only reload the Bing home page.
                            break;
                        }

                        // Attribute values in markup are HTML-encoded (e.g. "&amp;");
                        // decode before using the value as a URL.
                        string nextUrl = System.Net.WebUtility.HtmlDecode(next.Value);
                        bingTab.Navigate("https://www.bing.com" + nextUrl);
                        await PageLoad(30, 5, bingTab);

                        pageSource = bingTab.DocumentText ?? string.Empty;
                        // Follow-up pages use the same result markup as the first page.
                        ScrapePages(pageSource, resultLinkPattern, outputFile);
                    }

                    if (j == urlList.Items.Count - 1)
                    {
                        MessageBox.Show("URL's Scraped.");
                    }
                }
            }
            catch (Exception ex)
            {
                CommonCodes.WriteLog(ex.ToString());
                MessageBox.Show(ex.Message);
            }
        }

PageLoad方法:

/// <summary>
/// Waits until <paramref name="w"/> raises DocumentCompleted while its ReadyState is
/// Complete, or until the timeout expires, whichever happens first.
/// </summary>
/// <param name="TimeOut">Maximum time to wait, in seconds.</param>
/// <param name="delay">Polling interval in seconds; values below 1 fall back to a 10 ms poll.</param>
/// <param name="w">The browser whose navigation is being awaited.</param>
/// <returns>A task that completes once the page has loaded or the timeout has elapsed.</returns>
/// <remarks>
/// Exceptions are logged through <c>CommonCodes.WriteLog</c> and shown to the user
/// rather than propagated, so the returned task does not fault.
/// </remarks>
private async Task PageLoad(int TimeOut, int delay, WebBrowser w)
        {
            TaskCompletionSource<bool> pageLoaded = new TaskCompletionSource<bool>();

            // Kept in a variable so the exact same delegate can be detached afterwards.
            WebBrowserDocumentCompletedEventHandler onCompleted = (s, e) =>
            {
                // Ignore completions raised while the control as a whole is not yet Complete.
                if (w.ReadyState == WebBrowserReadyState.Complete)
                {
                    pageLoaded.TrySetResult(true);
                }
            };

            try
            {
                w.DocumentCompleted += onCompleted;

                // Track elapsed time in the same unit as the poll interval, so the
                // timeout really is TimeOut seconds whatever the interval is.
                long timeoutMs = TimeOut * 1000L;
                int pollMs = delay > 0 ? delay * 1000 : 10;
                long elapsedMs = 0;

                while (!pageLoaded.Task.IsCompleted)
                {
                    await Task.Delay(pollMs);
                    elapsedMs += pollMs;
                    if (elapsedMs >= timeoutMs)
                    {
                        // Give up waiting; callers proceed with whatever has loaded so far.
                        pageLoaded.TrySetResult(true);
                    }
                }
            }
            catch (Exception ex)
            {
                CommonCodes.WriteLog(ex.ToString());
                MessageBox.Show(ex.Message);
            }
            finally
            {
                // Detach so repeated calls on the same browser do not accumulate handlers.
                if (w != null)
                {
                    w.DocumentCompleted -= onCompleted;
                }
            }
        }

对 bing.com 执行到下面这行代码时,得到的页面源代码是空的;而同样的代码我也用于 google.com 页面:

string pageSource = this.bingTab.DocumentText.ToString();

-

有什么想法吗?谢谢。我也试过 DocumentStream,同样没有用,不知道为什么会这样。ScrapeBing 方法用于从 Bing 搜索结果页面中抓取网址,PageLoad 则是等待页面加载完成的方法。

0 个答案:

没有答案