I'm fairly new to C#.
I wrote a web scraper to collect data from a website. The data I want is rendered by JavaScript (or something similar), so it can't be pulled out of the raw HTML; I need a WebBrowser control to access the rendered page. That rules out using, for example, the WebClient class.
I want to collect data from 10 different pages of the site within an hour, but the code below only lets me run 2 at a time. If I start a third one, the first stops. After some googling I tried to fix this by adding
System.Net.ServicePointManager.DefaultConnectionLimit = 1000;
but that did absolutely nothing.
I'm still in development, so right now I'm running a separate Windows Forms app for each page I'm scraping.
Here is my code (I added the A variable because the site finishes loading 4 times):
public partial class Form1 : Form
{
    //GLOBAL VARIABLES
    int A = 0;

    public Form1()
    {
        InitializeComponent();
    }

    private void button1_Click(object sender, EventArgs e)
    {
        RunProgram();
    }

    void RunProgram()
    {
        System.Net.ServicePointManager.DefaultConnectionLimit = 1000;
        Uri link1 = new Uri("http://www.somesite.com/sdf4575gfn");
        WebBrowser wb = new WebBrowser();
        wb.AllowNavigation = true;
        wb.Navigate(link1);
        wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);
    }

    //WebSite loaded
    private void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        WebBrowser wb = sender as WebBrowser;
        A = A + 1;
        String content = "";
        if (A == 4)
        {
            wb.Document.ExecCommand("SelectAll", false, null);
            wb.Document.ExecCommand("Copy", false, null);
            content = Clipboard.GetText();
            //Store to file
        }
    }
}
Answer 0: (score: 0)
Get rid of the WebBrowser after each time you fetch the data, then re-use one for each URL. Loop through all the URLs and you will get near-simultaneous screen scraping. Also, what is wrong with making more WebBrowser instances and staggering your screen scrapes?
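A minimal sketch of that idea (my illustration, not from the original answer): one WebBrowser per URL, disposed as soon as its data is captured. The URLs and the helper name ScrapeNext are placeholders, and the sketch assumes a single DocumentCompleted event means the page is done, which, per the question, may not hold for heavily scripted pages.

using System;
using System.Collections.Generic;
using System.Windows.Forms;

public partial class Form1 : Form
{
    // URLs still waiting to be scraped (example values, not from the question)
    private readonly Queue<Uri> pending = new Queue<Uri>(new[]
    {
        new Uri("http://www.somesite.com/page1"),
        new Uri("http://www.somesite.com/page2")
    });

    void ScrapeNext()
    {
        if (pending.Count == 0) return;

        WebBrowser wb = new WebBrowser();
        wb.ScriptErrorsSuppressed = true;
        wb.DocumentCompleted += (s, e) =>
        {
            WebBrowser b = (WebBrowser)s;
            string html = b.Document.Body.OuterHtml; // rendered DOM, not just the raw source
            // TODO: parse html / store it to a file
            b.Dispose();                             // throw this browser away
            ScrapeNext();                            // and move on to the next URL
        };
        wb.Navigate(pending.Dequeue());
    }
}

Calling ScrapeNext() once (for example from a button click) walks through the whole queue sequentially, so there is never more than one page in flight at a time.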
Answer 1: (score: 0)
I've tried to do the same thing. It works for me most of the time, occasionally not perfectly, but maybe it will help you:
// pageLoaded is a bool field on the form, reset before each navigation
pageLoaded = false;
string url = "https://someurl.com";

webBrowser1.DocumentCompleted += browser_DocumentCompleted;
webBrowser1.Navigate(url);

// Pump the message loop until DocumentCompleted has fired
while (pageLoaded == false)
{
    Thread.Sleep(500);
    Application.DoEvents();
}

HtmlElement result = webBrowser1.Document.GetElementById("someid");
string value = result.InnerText;

void browser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    pageLoaded = true;
}
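One caveat with this pattern (my note, not part of the original answer): if DocumentCompleted never fires, the loop spins forever, so a simple deadline is a cheap safeguard. A sketch with an illustrative 30-second timeout:

// Same busy-wait, but give up after roughly 30 seconds (illustrative value)
DateTime deadline = DateTime.UtcNow.AddSeconds(30);
while (!pageLoaded && DateTime.UtcNow < deadline)
{
    Thread.Sleep(500);
    Application.DoEvents();
}
if (!pageLoaded)
{
    // handle the timeout, e.g. skip this URL or retry
}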
Answer 2: (score: 0)
OK, I managed to get it working thanks to the suggestions above. Thanks to everyone who contributed.
Here is a stripped-down version of my code. If you change the siteID entries and clean the code up a bit, it should work for you.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace WebBrowserTesting
{
    public partial class Form1 : Form
    {
        //siteID
        string[] siteID =
        {
            "http://www.somesite.com/3jhurjkrtukty",
            "http://www.somesite.com/dfb87uhs89h7df9g",
            "http://www.somesite.com/mfg5t456rj"
        };

        //Event counters
        int K1 = 0;
        int K2 = 0;
        int K3 = 0;

        public Form1()
        {
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            runProgram();
        }

        void runProgram()
        {
            for (int k = 0; k < siteID.Length; k++)
            {
                WebBrowser wb1 = new WebBrowser();
                Uri url1 = new Uri(siteID[k]);
                wb1.DocumentCompleted += wb_DocumentCompleted;
                wb1.Navigate(url1);
            }
        }

        void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            WebBrowser inner = sender as WebBrowser;
            int counter = updateCounter(inner.Url.ToString());

            // Mysterious bug: each counter is incremented siteID.Length (3) times
            // per event, because updateCounter runs its comparisons inside a for
            // loop without using k, hence the division by 3 here.
            if (K1 / 3 == 4 || K2 / 3 == 4 || K3 / 3 == 4)
            {
                //In my case the page isn't loaded until the fourth event.
                //Here the page is fully loaded. Starting a new thread.
                Crawler page = new Crawler();
                Thread oThread = new Thread(() => page.scraper(inner));
                oThread.Start();
            }
        }

        //Page isn't loaded until the DocumentCompleted event has fired 4 times.
        int updateCounter(string kid)
        {
            int num = 99;
            for (int k = 0; k < siteID.Length; k++)
            {
                if (String.Compare(kid, siteID[0]) == 0)
                {
                    K1 = K1 + 1;
                    num = K1;
                }
                else if (String.Compare(kid, siteID[1]) == 0)
                {
                    K2 = K2 + 1;
                    num = K2;
                }
                else if (String.Compare(kid, siteID[2]) == 0)
                {
                    K3 = K3 + 1;
                    num = K3;
                }
            }
            return num;
        }
    }

    public class Crawler
    {
        public void scraper(WebBrowser inn)
        {
            int life = 0;
            //Primitive loop for testing purposes
            while (life < 1000)
            {
                if (life % 10 == 0 && life > 1)
                {
                    Thread.Sleep(2000);
                    inn.Invoke(new Action(() =>
                    {
                        inn.Document.ExecCommand("SelectAll", false, null);
                        inn.Document.ExecCommand("Copy", false, null);
                        string content = Clipboard.GetText();
                        Console.WriteLine("Content : " + content);
                        //write content to file
                    }));
                }
                life = life + 1;
            }
        }
    }
}
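A side note on the four-events workaround (my addition, not from the thread): DocumentCompleted is raised once per frame, so a common alternative to counting events is to ignore completions whose e.Url differs from the control's own Url. For content injected by JavaScript after the load even that isn't enough, so polling the rendered DOM (as the Crawler above effectively does) may still be necessary. A minimal sketch:

void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    WebBrowser wb = (WebBrowser)sender;

    // Frames raise DocumentCompleted too; only the top-level document
    // completes with e.Url equal to the control's Url.
    if (e.Url != wb.Url) return;

    // The static HTML is loaded here; script-generated content may still
    // appear later, so a timer or DOM poll can be layered on top.
    string html = wb.Document.Body.OuterHtml;
    // ... hand off for scraping
}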