同时使用多个 WebBrowser 抓取网页

时间:2016-01-16 19:59:09

标签: c# web-scraping

C#相当新。

我写了一个网络爬虫(web scraper)来收集网站上的数据。由于我想要的数据是通过 JavaScript 等方式动态渲染的,无法直接从原始 HTML 中获取,因此我需要使用 WebBrowser 来访问渲染后的页面。这也意味着无法使用诸如 WebClient 之类的类。

我想在一小时内收集网站上10个不同页面的数据,但下面的代码只允许我一次做2个。如果我开始第三个程序,第一个程序就会停止。谷歌搜索后,我尝试通过添加

来解决这个问题
System.Net.ServicePointManager.DefaultConnectionLimit = 1000;

但这完全没有效果。

我还在开发中,所以现在我正在为我正在抓取的每个页面运行一个单独的Windows窗体。

这是我的代码: (我添加了A变量,因为该站点完成加载4次)。

public partial class Form1 : Form
{
    // Number of DocumentCompleted events seen so far. The target site
    // raises DocumentCompleted several times per page (frames/redirects);
    // per the author, the content is only fully rendered after the 4th.
    int A = 0;

    public Form1()
    {
        InitializeComponent();
    }

    private void button1_Click(object sender, EventArgs e)
    {
        RunProgram();
    }

    // Creates a WebBrowser, wires up the completion handler, and starts
    // navigating to the page to be scraped.
    void RunProgram()
    {
        // Raise the per-host HTTP connection limit so several browsers
        // can load pages concurrently.
        System.Net.ServicePointManager.DefaultConnectionLimit = 1000;

        Uri link1 = new Uri("http://www.somesite.com/sdf4575gfn");

        WebBrowser wb = new WebBrowser();
        wb.AllowNavigation = true;

        // BUG FIX: subscribe to DocumentCompleted BEFORE calling Navigate.
        // The original attached the handler after navigation had already
        // started, which risks missing early completion events.
        wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);
        wb.Navigate(link1);
    }

    //WebSite loaded
    private void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        WebBrowser wb = sender as WebBrowser;
        if (wb == null)
        {
            return; // defensive: sender should always be the WebBrowser
        }

        A = A + 1;

        if (A == 4)
        {
            // BUG FIX: read the rendered text straight from the DOM instead
            // of the SelectAll/Copy clipboard hack. The clipboard is a single
            // shared, user-visible resource, so the old approach both clobbers
            // the user's clipboard and breaks as soon as two scrapers run at
            // the same time — which is exactly the goal here.
            string content = "";
            if (wb.Document != null && wb.Document.Body != null)
            {
                content = wb.Document.Body.InnerText;
            }
            //Store to file
        }
    }
}

3 个答案:

答案 0 :(得分:0)

每次获取完数据之后,重复使用同一个 WebBrowser 去处理下一个 URL。循环遍历所有网址,就能得到近乎同步的页面抓取。另外,创建更多的 WebBrowser 实例并错开各次抓取,又有什么问题呢?

答案 1 :(得分:0)

我也尝试过做这件事。它在大多数情况下对我有效,偶尔也会失灵,但也许会对你有帮助:

            // Snippet from the answer (fragment — the enclosing method and the
            // field declarations for pageLoaded/webBrowser1/result/value are
            // not shown). Pattern: navigate, then pump messages until the
            // DocumentCompleted handler flips the flag.
            pageLoaded = false;
            string url = "https://someurl.com" ;
            webBrowser1.DocumentCompleted += browser_DocumentCompleted;
            webBrowser1.Navigate(url);
            // Busy-wait for the page to finish loading.
            // NOTE(review): Application.DoEvents() re-enters the message loop
            // and is widely discouraged (re-entrancy hazards); it is what
            // allows DocumentCompleted to fire while this loop spins. There is
            // also no timeout — if the page never loads this loops forever.
            while (pageLoaded == false)
            {
                Thread.Sleep(500);       
                Application.DoEvents();  
            }

            // Extract the element of interest from the rendered DOM.
            // NOTE(review): assumes an element with id "someid" exists —
            // GetElementById returns null otherwise; confirm against the page.
            result = (webBrowser1.Document.GetElementById("someid"));
            value = result.InnerText;

        // Signals the polling loop above that the document finished loading.
        void browser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            pageLoaded = true;
        }

答案 2 :(得分:0)

好的,所以我设法让它工作得益于所提出的建议。感谢所有贡献的人。

这是我的代码的精简版。如果您更改siteID并稍微清理代码,它应该适合您。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace WebBrowserTesting
{

    public partial class Form1 : Form
    {
        // URLs of the pages to scrape. The counters below are matched to
        // these entries by URL comparison in updateCounter.
        //siteID
        string[] siteID =
        {
            "http://www.somesite.com/3jhurjkrtukty",
            "http://www.somesite.com/dfb87uhs89h7df9g",
            "http://www.somesite.com/mfg5t456rj"
        };

        // Per-site DocumentCompleted event counters. The target site raises
        // DocumentCompleted several times per page; per the author it is only
        // fully rendered after the 4th event.
        int K1 = 0;
        int K2 = 0;
        int K3 = 0;

        public Form1()
        {
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            runProgram();
        }

        // Starts one WebBrowser per URL. The handler is attached before
        // Navigate so no completion events can be missed.
        void runProgram()
        {
            for (int k = 0; k < siteID.Length; k++)
            {
                WebBrowser wb1 = new WebBrowser();

                Uri url1 = new Uri(siteID[k]);

                wb1.DocumentCompleted += wb_DocumentCompleted;
                wb1.Navigate(url1);
            }
        }

        void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            WebBrowser inner = sender as WebBrowser;
            if (inner == null)
            {
                return; // defensive: sender should always be a WebBrowser
            }

            int counter = updateCounter(inner.Url.ToString());

            // Page isn't fully loaded until DocumentCompleted has fired 4
            // times for this particular site. With updateCounter fixed to
            // advance exactly once per event (see below), the original
            // "K1/3 == 4" workaround for the mysterious bug is unnecessary.
            if (counter == 4)
            {
                // Fully loaded: hand the browser off to a worker thread.
                Crawler page = new Crawler();
                Thread oThread = new Thread(() => page.scraper(inner));
                oThread.Start();
            }
        }

        // Increments the event counter belonging to the given URL and returns
        // its new value; returns 99 for an unrecognised URL.
        //
        // BUG FIX: the original wrapped these comparisons in a
        // "for (k = 0; k < siteID.Length; k++)" loop while still indexing
        // siteID with the constants 0/1/2, so the matching counter was
        // incremented siteID.Length (3) times per event — the source of the
        // "mysterious" K1/3 == 4 condition. Each event now counts once.
        //
        // NOTE(review): inner.Url may differ from the siteID entry after
        // redirects or URL normalisation — verify the comparison still matches.
        int updateCounter(string kid)
        {
            if (String.Compare(kid, siteID[0]) == 0)
            {
                K1 = K1 + 1;
                return K1;
            }
            if (String.Compare(kid, siteID[1]) == 0)
            {
                K2 = K2 + 1;
                return K2;
            }
            if (String.Compare(kid, siteID[2]) == 0)
            {
                K3 = K3 + 1;
                return K3;
            }
            return 99;
        }

    }

    public class Crawler
    {
        // Runs on a worker thread: periodically copies the rendered page text
        // out of the supplied WebBrowser (via the clipboard) and dumps it to
        // the console. All DOM and clipboard access is marshalled back onto
        // the UI thread through Control.Invoke, since WebBrowser is not
        // thread-safe.
        public void scraper(WebBrowser inn)
        {
            //Primitive loop for testing purposes
            for (int tick = 0; tick < 1000; tick++)
            {
                // Only act on every 10th iteration (skipping tick 0).
                if (tick % 10 != 0 || tick <= 1)
                {
                    continue;
                }

                Thread.Sleep(2000);

                inn.Invoke(new Action(() =>
                {
                    // Select the whole document and copy it to the clipboard,
                    // then read the text back out.
                    inn.Document.ExecCommand("SelectAll", false, null);
                    inn.Document.ExecCommand("Copy", false, null);
                    string content = Clipboard.GetText();
                    Console.WriteLine("Content : " + content);
                    //write content to file
                }));
            }
        }

    }
}