为什么我的网页抓取程序逐个点击列表中的链接时速度非常慢?(C#)

时间:2019-05-09 19:50:59

标签: c# selenium web-scraping

我想单击所有文本中包含“300”的链接。我的网页抓取代码点击每个链接的速度非常慢。我先把链接存储在列表中,然后逐个单击它们。

我先统计链接的数量用于索引,然后使用 for (int pos = 0; pos < elements; pos++) 循环逐个点击。我在 [https://www.w3schools.com/html/default.asp] 上用 By.PartialLinkText("3600") 测试过同样的代码,它的响应速度非常快,但是在另一个站点上速度很慢。

class Program
{
    private static IWebDriver driver = null;

    /// <summary>
    /// Entry point: starts Internet Explorer, opens the target site and clicks
    /// every link whose text contains "300".
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        driver = new InternetExplorerDriver();
        try
        {
            driver.Manage().Window.Maximize();
            driver.Navigate().GoToUrl("https://arbitrary.com/");

            clickAllLinks("300");
        }
        finally
        {
            // Quit() closes the browser window and shuts down the driver server
            // process; without it both are left running after the program ends,
            // including when a click throws.
            driver.Quit();
        }
    }

    /// <summary>
    /// Clicks, one after another, every link inside the "data" container whose
    /// text contains <paramref name="tagName"/>.
    /// </summary>
    /// <param name="tagName">Partial link text used to select the links.</param>
    public static void clickAllLinks(string tagName)
    {
        // Count the matching links once; the count only bounds the loop.
        IWebElement container = driver.FindElement(By.XPath("//div[@class='data']"));
        int total = container.FindElements(By.PartialLinkText(tagName)).Count();

        int index = 0;
        while (index < total)
        {
            // The link is looked up again on every pass rather than cached.
            IWebElement link = getElementWithIndex(By.PartialLinkText(tagName), index);
            link.Click();
            //fetchdata();
            index++;
        }
    }

    /// <summary>
    /// Locates the "data" container again and returns the element at position
    /// <paramref name="pos"/> among the descendants matched by <paramref name="by"/>.
    /// </summary>
    /// <param name="by">Locator applied inside the "data" container.</param>
    /// <param name="pos">Zero-based index into the matched elements.</param>
    /// <returns>The matched element at the requested index.</returns>
    public static IWebElement getElementWithIndex(By by, int pos)
    {
        IWebElement container =
        driver.FindElement(By.XPath("//div[@class='data']"));

        // Honor the caller's locator; the previous version ignored `by` and
        // always searched for the hard-coded text "300".
        IList<IWebElement> matches = container.FindElements(by);
        return matches[pos];
    }



    // Scrapes table data for the page the browser is currently showing.
    // Reads the current URL from the driver, downloads that URL again with a
    // new HttpClient, parses the response into an HtmlDocument and collects
    // the <td> cells of every table row that has at least one.
    // NOTE(review): declared `async void`, so callers cannot await it or
    // observe its exceptions - confirm this is intended.
    // NOTE(review): the method body is cut off in this excerpt (no closing
    // brace, no use of the collected rows is visible).
    public static async void fetchdata()
    {
        string currentURL = driver.Url; //url to string
        Console.WriteLine("URL: " + currentURL);

        // A separate HTTP request is made for the same URL; the HTML parsed
        // below comes from this request, not from the driver.
        var httpclient = new HttpClient();
        var html = await httpclient.GetStringAsync(currentURL);

        var htmldoc = new HtmlDocument();
        htmldoc.LoadHtml(html); //html to htmldoc

      // NOTE(review): the Select below yields the td elements themselves; no
      // InnerText projection is visible here even though the target type is
      // List<List<string>> - confirm against the full source.
      List<List<string>> Receipt = 
      htmldoc.DocumentNode.SelectSingleNode("//table[@class='classname']")
            //htmldoc into list TABLE->TR->TD->InnerText
            .Descendants("tr")
            // keep only rows that contain at least one td cell
            .Where(tr => tr.Elements("td").Count() > 0)
            .Select(tr => tr.Elements("td")
            .ToList())
            .ToList();

1 个答案:

答案 0 :(得分:0)

这是clickAllLinks方法的简化版本。这样可以减少当前方法的开销(不必要地获取元素和存储元素,这可能会影响执行速度)。

/// <summary>
/// Clicks every link inside the "data" container whose text contains
/// <paramref name="tagName"/>, locating each link with a single indexed XPath
/// query instead of fetching the whole link list on every iteration.
/// </summary>
/// <param name="tagName">
/// Text the link must contain. It is embedded in a single-quoted XPath string
/// literal, so it must not itself contain a single quote.
/// </param>
public static void clickAllLinks(string tagName)
{
    // Quote the search text so XPath treats it as a string literal.
    string linksXPath = "//div[@class='data']//a[contains(.,'" + tagName + "')]";

    int elements = driver.FindElements(By.XPath(linksXPath)).Count;

    // XPath positions are 1-based, so the last link is at index == elements.
    for (int pos = 1; pos <= elements; pos++)
    {
        // Parenthesize the path so [pos] indexes the whole result set, and use
        // FindElement (singular): Click() exists on an element, not a collection.
        driver.FindElement(By.XPath("(" + linksXPath + ")[" + pos + "]")).Click();
        //fetchdata();
    }
}