In its current state this does not work properly: it keeps adding the same links to the List over and over.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using HtmlAgilityPack;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Net;
using System.Web;

namespace GatherLinks
{
    public partial class Form1 : Form
    {
        int sites = 0;
        int y = 0;
        string url = @"http://www.google.co.il";
        string guys = "http://www.google.com";

        public Form1()
        {
            InitializeComponent();
            List<string> a = webCrawler(guys, 2);
        }

        private void Form1_Load(object sender, EventArgs e)
        {
        }

        private int factorial(int n)
        {
            if (n == 0) return 1;
            else y = n * factorial(n - 1);
            richTextBox1.Text = y.ToString();
            return y;
        }

        private List<string> getLinks(HtmlAgilityPack.HtmlDocument document)
        {
            List<string> mainLinks = new List<string>();
            var linkNodes = document.DocumentNode.SelectNodes("//a[@href]");
            if (linkNodes != null)
            {
                foreach (HtmlNode link in linkNodes)
                {
                    var href = link.Attributes["href"].Value;
                    mainLinks.Add(href);
                }
            }
            return mainLinks;
        }

        private List<string> webCrawler(string url, int levels)
        {
            HtmlAgilityPack.HtmlDocument doc;
            HtmlWeb hw = new HtmlWeb();
            List<string> webSites; // = new List<string>();
            List<string> csFiles = new List<string>();
            csFiles.Add("temp string to know that something is happening in level = " + levels.ToString());
            csFiles.Add("current site name in this level is : " + url);
            try
            {
                doc = hw.Load(url);
                webSites = getLinks(doc);
                if (levels == 0)
                {
                    return csFiles;
                }
                else
                {
                    int actual_sites = 0;
                    for (int i = 0; i < webSites.Count() && i < 20; i++)
                    {
                        string t = webSites[i];
                        if (t.StartsWith("http://") || t.StartsWith("https://")) // replace this with future FilterJunkLinks function
                        {
                            // for (int e = 0; e < csFiles.Count; e++)
                            // {
                            if (csFiles.Contains(t))
                            {
                            }
                            else
                            {
                                actual_sites++;
                                csFiles.AddRange(webCrawler(t, levels - 1));
                                Texts(richTextBox1, "Level Number " + levels + " " + t + Environment.NewLine, Color.Red);
                            }
                            // }
                        }
                    }
                    // report to a message box only at high levels..
                    //if (levels==1)
                    //MessageBox.Show(actual_sites.ToString());
                    return csFiles;
                }
            }
            catch
            {
                return csFiles;
            }
        }
The Texts function:
        public void Texts(RichTextBox box, string text, Color color)
        {
            box.SelectionStart = box.TextLength;
            box.SelectionLength = 0;
            box.SelectionColor = color;
            box.AppendText(text);
            box.SelectionColor = box.ForeColor;
        }
I need to do two things in the webCrawler function:
1. If the url variable cannot be resolved, the try/catch should take care of it.
2. If the List csFiles already contains an item, do not add it again. For example, if http://www.google.com is already in csFiles, do not add http://www.google.com a second time, so that in the end the csFiles list contains http://www.google.com only once.
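For the second requirement, here is a minimal sketch of the usual de-duplication pattern, assuming a separate HashSet<string> (called visited here, a name not in the original code) that holds only URLs rather than the mixed log strings kept in csFiles:

using System;
using System.Collections.Generic;

class VisitedDemo
{
    static void Main()
    {
        // HashSet<string>.Add returns false when the item is already present,
        // which is exactly the "only once" behaviour wanted for csFiles.
        HashSet<string> visited = new HashSet<string>();
        Console.WriteLine(visited.Add("http://www.google.com")); // True  - newly added
        Console.WriteLine(visited.Add("http://www.google.com")); // False - already there
    }
}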
Answer 0 (score: 0)
Your use of Contains is wrong: csFiles holds your log strings ("current site name in this level is : ..."), never the bare URLs, so Contains(t) can never match. If I understand your question correctly... I'll bravely venture a guess...
for (int i = 0; i < webSites.Count && i < 20; i++)
{
    string t = webSites[i]; // this declaration was missing from the original snippet
    if (t.StartsWith("http://") || t.StartsWith("https://")) // replace this with future FilterJunkLinks function
    {
        if (csFiles.Contains(t))
        {
            // already seen this link: skip it
        }
        else
        {
            actual_sites++;
            csFiles.AddRange(webCrawler(t, levels - 1));
            Texts(richTextBox1, "Level Number " + levels + " " + t + Environment.NewLine, Color.Red);
        }
    }
}
Update: I don't know much about recursion; in fact I avoid it, but I think this is your other problem. When you call the code recursively, you "forget" the previous list that held all the links, because every call to webCrawler creates a brand-new csFiles. So instead of creating a new csFiles on each call, I suggest you pass a reference to the list into webCrawler. Hopefully that solves it.
Meaning:
private List<string> webCrawler(string url, int levels, List<string> csFiles)
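A minimal sketch of how the refactored crawler could look, assuming the caller creates the list once and seeds the recursion. Storing the bare URL in csFiles (instead of the log strings the question uses) is an added assumption here, made so that Contains can actually match:

// Sketch of the answer's suggestion: one list shared across all recursion levels.
// Assumes the same Form1 fields and the Texts helper shown in the question.
private List<string> webCrawler(string url, int levels, List<string> csFiles)
{
    HtmlWeb hw = new HtmlWeb();
    try
    {
        HtmlAgilityPack.HtmlDocument doc = hw.Load(url);
        List<string> webSites = getLinks(doc);
        if (levels == 0)
            return csFiles;

        for (int i = 0; i < webSites.Count && i < 20; i++)
        {
            string t = webSites[i];
            if (t.StartsWith("http://") || t.StartsWith("https://"))
            {
                // The shared list remembers every URL seen at any level,
                // so each link is crawled and stored at most once.
                if (!csFiles.Contains(t))
                {
                    csFiles.Add(t);
                    webCrawler(t, levels - 1, csFiles);
                    Texts(richTextBox1, "Level Number " + levels + " " + t + Environment.NewLine, Color.Red);
                }
            }
        }
        return csFiles;
    }
    catch
    {
        // Unresolvable or unreachable url: leave the list untouched and back out.
        return csFiles;
    }
}

// Initial call, e.g. from the constructor:
// List<string> a = webCrawler(guys, 2, new List<string>());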