In its current state this does not work properly: it keeps adding the same links to the List over and over.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using HtmlAgilityPack;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Net;
using System.Web;

namespace GatherLinks
{
    public partial class Form1 : Form
    {
        int sites = 0;
        int y = 0;
        string url = @"http://www.google.co.il";
        string guys = "http://www.google.com";

        public Form1()
        {
            InitializeComponent();
            List<string> a = webCrawler(guys, 2);
        }

        private void Form1_Load(object sender, EventArgs e)
        {
        }

        private int factorial(int n)
        {
            if (n == 0) return 1;
            else y = n * factorial(n - 1);
            richTextBox1.Text = y.ToString();
            return y;
        }

        private List<string> getLinks(HtmlAgilityPack.HtmlDocument document)
        {
            List<string> mainLinks = new List<string>();
            var linkNodes = document.DocumentNode.SelectNodes("//a[@href]");
            if (linkNodes != null)
            {
                foreach (HtmlNode link in linkNodes)
                {
                    var href = link.Attributes["href"].Value;
                    mainLinks.Add(href);
                }
            }
            return mainLinks;
        }

        private List<string> webCrawler(string url, int levels)
        {
            HtmlAgilityPack.HtmlDocument doc;
            HtmlWeb hw = new HtmlWeb();
            List<string> webSites; // = new List<string>();
            List<string> csFiles = new List<string>();
            csFiles.Add("temp string to know that something is happening in level = " + levels.ToString());
            csFiles.Add("current site name in this level is : " + url);
            try
            {
                doc = hw.Load(url);
                webSites = getLinks(doc);
                if (levels == 0)
                {
                    return csFiles;
                }
                else
                {
                    int actual_sites = 0;
                    for (int i = 0; i < webSites.Count() && i < 20; i++)
                    {
                        string t = webSites[i];
                        if (t.StartsWith("http://") || t.StartsWith("https://")) // replace this with future FilterJunkLinks function
                        {
                            // for (int e = 0; e < csFiles.Count; e++)
                            // {
                            if (csFiles.Contains(t))
                            {
                            }
                            else
                            {
                                actual_sites++;
                                csFiles.AddRange(webCrawler(t, levels - 1));
                                Texts(richTextBox1, "Level Number " + levels + " " + t + Environment.NewLine, Color.Red);
                            }
                            // }
                        }
                    }
                    // report to a message box only at high levels..
                    //if (levels==1)
                    //MessageBox.Show(actual_sites.ToString());
                    return csFiles;
                }
            }
            catch
            {
                return csFiles;
            }
        }
The Texts function:
        public void Texts(RichTextBox box, string text, Color color)
        {
            box.SelectionStart = box.TextLength;
            box.SelectionLength = 0;
            box.SelectionColor = color;
            box.AppendText(text);
            box.SelectionColor = box.ForeColor;
        }
I need to do two things in the webCrawler function:
1. If the url variable cannot be resolved, the try/catch should take care of it.
2. If the List csFiles already contains an item, do not add it again. For example, if http://www.google.com is already in csFiles, do not add http://www.google.com a second time, so that in the end the csFiles list contains http://www.google.com only once.
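For the second requirement, here is a minimal sketch of the usual de-duplication pattern, assuming a separate HashSet<string> (called visited here, a name not in the original code) that holds only URLs rather than the mixed log strings kept in csFiles:

using System;
using System.Collections.Generic;

class VisitedDemo
{
    static void Main()
    {
        // HashSet<string>.Add returns false when the item is already present,
        // which is exactly the "only once" behaviour wanted for csFiles.
        HashSet<string> visited = new HashSet<string>();
        Console.WriteLine(visited.Add("http://www.google.com")); // True  - newly added
        Console.WriteLine(visited.Add("http://www.google.com")); // False - already there
    }
}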
Answer 0 (score: 0)
Your use of Contains is wrong: csFiles holds your log strings ("current site name in this level is : ..."), never the bare URLs, so Contains(t) can never match. If I understand your question correctly... I'll bravely venture a guess...
for (int i = 0; i < webSites.Count && i < 20; i++)
{
    string t = webSites[i]; // this declaration was missing from the original snippet
    if (t.StartsWith("http://") || t.StartsWith("https://")) // replace this with future FilterJunkLinks function
    {
        if (csFiles.Contains(t))
        {
            // already seen this link: skip it
        }
        else
        {
            actual_sites++;
            csFiles.AddRange(webCrawler(t, levels - 1));
            Texts(richTextBox1, "Level Number " + levels + " " + t + Environment.NewLine, Color.Red);
        }
    }
}
Update: I don't know much about recursion; in fact I avoid it, but I think this is your other problem. When you call the code recursively, you "forget" the previous list that held all the links, because every call to webCrawler creates a brand-new csFiles. So instead of creating a new csFiles on each call, I suggest you pass a reference to the list into webCrawler. Hopefully that solves it.
Meaning:
private List<string> webCrawler(string url, int levels, List<string> csFiles)
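A minimal sketch of how the refactored crawler could look, assuming the caller creates the list once and seeds the recursion. Storing the bare URL in csFiles (instead of the log strings the question uses) is an added assumption here, made so that Contains can actually match:

// Sketch of the answer's suggestion: one list shared across all recursion levels.
// Assumes the same Form1 fields and the Texts helper shown in the question.
private List<string> webCrawler(string url, int levels, List<string> csFiles)
{
    HtmlWeb hw = new HtmlWeb();
    try
    {
        HtmlAgilityPack.HtmlDocument doc = hw.Load(url);
        List<string> webSites = getLinks(doc);
        if (levels == 0)
            return csFiles;

        for (int i = 0; i < webSites.Count && i < 20; i++)
        {
            string t = webSites[i];
            if (t.StartsWith("http://") || t.StartsWith("https://"))
            {
                // The shared list remembers every URL seen at any level,
                // so each link is crawled and stored at most once.
                if (!csFiles.Contains(t))
                {
                    csFiles.Add(t);
                    webCrawler(t, levels - 1, csFiles);
                    Texts(richTextBox1, "Level Number " + levels + " " + t + Environment.NewLine, Color.Red);
                }
            }
        }
        return csFiles;
    }
    catch
    {
        // Unresolvable or unreachable url: leave the list untouched and back out.
        return csFiles;
    }
}

// Initial call, e.g. from the constructor:
// List<string> a = webCrawler(guys, 2, new List<string>());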