我用 C# 创建了一个简单的应用程序,它使用 HtmlAgilityPack 通过 XPath 抓取网页内容。我只是解析网站并提取其中的内容块。
我的基本代码:
/// <summary>
/// Lets the user pick a .txt file, reads it line by line, loads each line as a URL
/// with HtmlAgilityPack, and appends one row per line to <c>dataGridView1</c>.
/// </summary>
/// <remarks>
/// Row contents per line:
/// the difference <c>top - mine</c> when both values are scraped;
/// "ANOTHER" or "NONE" when scraping raises <see cref="ArgumentNullException"/>;
/// "WRONG URL" when a <see cref="UriFormatException"/> is raised.
/// Any other exception type is not handled here and propagates to the caller,
/// exactly as before; the file and dialog are now released in that case too.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // The dialog holds native resources, so dispose it when the handler exits.
    using (OpenFileDialog ofd = new OpenFileDialog())
    {
        ofd.Filter = ".txt|*.txt";
        if (ofd.ShowDialog() != DialogResult.OK)
        {
            return; // no file was chosen
        }

        label4.Text = ofd.SafeFileName;
        string my_site = textBox1.Text;

        // One HtmlWeb instance is reused for every URL instead of allocating one per line.
        var webGet = new HtmlWeb();

        // 'using' guarantees the file handle is released even if an unhandled
        // exception escapes the loop (the original never closed the reader).
        using (System.IO.StreamReader file = new System.IO.StreamReader(ofd.FileName))
        {
            string line;
            while ((line = file.ReadLine()) != null) // process the file line by line
            {
                try
                {
                    var doc = webGet.Load(line); // download and parse the page
                    // scrape_mine / scrape_top are defined elsewhere; their results are converted to int.
                    int mine = Convert.ToInt32(scrape_mine(doc, my_site));
                    int top = Convert.ToInt32(scrape_top(doc));
                    if (top == 0)
                    {
                        top = 1;
                    }
                    dataGridView1.Rows.Add(line, top - mine);
                }
                catch (ArgumentNullException) // per the original author's note: the value could not be scraped
                {
                    try
                    {
                        string sourceCode = Worker.getSourceCode(line);
                        // Plain ordinal substring test; equivalent to the former Regex "(example)" match.
                        // A null source yields "NONE", matching the old behavior where the
                        // resulting exception was swallowed by the fallback catch below.
                        if (sourceCode != null && sourceCode.Contains("example"))
                        {
                            dataGridView1.Rows.Add(line, "ANOTHER");
                        }
                        else
                        {
                            dataGridView1.Rows.Add(line, "NONE");
                        }
                    }
                    catch (Exception)
                    {
                        // Deliberate best-effort: any failure while fetching the raw
                        // source is reported as "NONE" rather than aborting the run.
                        dataGridView1.Rows.Add(line, "NONE");
                    }
                }
                catch (UriFormatException) // the line is not a well-formed URL
                {
                    dataGridView1.Rows.Add(line, "WRONG URL");
                }
            }
        }

        // Defined elsewhere; the original author describes it as sorting the extracted data.
        countRows(dataGridView1);
        MessageBox.Show("Finished!");
    }
}
我想不需要粘贴全部代码。问题是,正如您所看到的,我逐行读取网址,并依次下载、解析每个页面的源代码。解析同一个站点的 5000 个网址大约需要 15 分钟(ping 约 50 ms)。有没有办法缩短这个时间?