Recursive web crawling in Python

Posted: 2017-03-19 15:03:04

Tags: python python-2.7 beautifulsoup web-crawler

Here is my code:

import requests
from bs4 import BeautifulSoup
import re

class WebCrawler():
    def check(self, links):
        global imgCount
        for item in links:
            targetURL = item['href']
            if(targetURL.startswith('/')):
                targetURL = target + targetURL  # add http:// and hostname to url

            target_html = requests.get(targetURL)
            parsed_html = BeautifulSoup(target_html.text, 'html.parser')
            if parsed_html.title.text not in pages:
                pages.append(parsed_html.title.text)
                print "[+] Collecting images page : " + parsed_html.title.text
                images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                for img_url in images:
                    imgCount = imgCount + 1
                    # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
pages = []
imgCount = 0
target = raw_input("Please enter base url: ")

data = BeautifulSoup(requests.get(target).text, 'html.parser')

link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)

print "===================== Total Collected Images =====================\n"
print imgCount

I want to keep doing this on the other pages as well, i.e. keep counting until there are no links left. But when I call the check function recursively, it doesn't work!

import requests
from bs4 import BeautifulSoup
import re

class WebCrawler():
    def check(self, links):
        global imgCount
        for item in links:
            targetURL = item['href']
            if(targetURL.startswith('/')):
                targetURL = target + targetURL  # add http:// and hostname to url

            target_html = requests.get(targetURL)
            parsed_html = BeautifulSoup(target_html.text, 'html.parser')
            if parsed_html.title.text not in pages:
                pages.append(parsed_html.title.text)
                print "[+] Collecting images page : " + parsed_html.title.text
                images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                for img_url in images:
                    imgCount = imgCount + 1
                    # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
            lnks = parsed_html.find_all('a')
            self.check(lnks)


pages = []
imgCount = 0
target = raw_input("Please enter base url: ")

data = BeautifulSoup(requests.get(target).text, 'html.parser')

link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)

print "===================== Total Collected Images =====================\n"
print imgCount  

I added these lines to it:

lnks = parsed_html.find_all('a')
self.check(lnks)

This time, the loop only runs once!
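It is hard to be certain without seeing the target site, but two things commonly break this kind of depth-first recursion. First, the recursive self.check(lnks) call sits inside the for loop, so the crawler dives into the first link's links before it ever reaches the second item of the outer list. Second, any unhandled exception along the way (a relative href such as "page.html" or "mailto:" being requested as-is, or a page with no <title> making parsed_html.title.text fail) unwinds the whole call stack, which looks exactly like "the loop only ran once". Below is a minimal sketch of one way to make the recursion terminate cleanly; it is not the original class, and the names crawl, base, visited and counts are purely illustrative. It joins relative links with urljoin, keeps a visited set, and guards against missing titles and request errors:

import requests
from bs4 import BeautifulSoup
from urlparse import urljoin  # Python 2; on Python 3 use urllib.parse
import re

IMG_SRC = re.compile(r'(jpe?g|png|svg)$')

def crawl(url, base, visited, counts):
    if url in visited:            # don't revisit pages we have already counted
        return
    visited.add(url)
    try:
        page = requests.get(url, timeout=10)
    except requests.RequestException:
        return                    # skip unreachable links instead of crashing
    html = BeautifulSoup(page.text, 'html.parser')
    title = html.title.text if html.title else url   # some pages have no <title>
    print "[+] Collecting images page : " + title
    counts['images'] += len(html.find_all('img', {'src': IMG_SRC}))
    for a in html.find_all('a', href=True):
        link = urljoin(url, a['href'])   # turn relative hrefs into absolute URLs
        if link.startswith(base):        # stay on the starting site
            crawl(link, base, visited, counts)

base = raw_input("Please enter base url: ")
counts = {'images': 0}
crawl(base, base, set(), counts)
print "===================== Total Collected Images ====================="
print counts['images']

Note that a very deep site can still hit Python's recursion limit; rewriting crawl around a queue of pending URLs in a plain while loop avoids that.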

1 Answer:

Answer 0 (score: 1)

Try something like this:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_items", follow= True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        items = []
        for title in titles:  # avoid shadowing the titles list
            item = CraigslistSampleItem()
            item["title"] = title.xpath("a/text()").extract()
            item["link"] = title.xpath("a/@href").extract()
            items.append(item)
        return items
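One caveat on the snippet above: scrapy.contrib, SgmlLinkExtractor and HtmlXPathSelector come from older Scrapy releases and have since been deprecated and removed. Roughly the same spider on a recent Scrapy install would look like the sketch below (assuming the same project layout; the item fields are inlined as a plain dict for brevity):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    # follow the "next" pagination button and parse every page it reaches
    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//a[@class="button next"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        # response.xpath() replaces the old HtmlXPathSelector wrapper
        for row in response.xpath('//span[@class="pl"]'):
            yield {
                "title": row.xpath("a/text()").extract_first(),
                "link": row.xpath("a/@href").extract_first(),
            }

The spider is then run from the project directory with scrapy crawl craigs.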