This is my code:
    import requests
    from bs4 import BeautifulSoup
    import re

    class WebCrawler():
        def check(self, links):
            global imgCount
            for item in links:
                targetURL = item['href']
                if targetURL.startswith('/'):
                    targetURL = target + targetURL  # add http:// and hostname to url
                target_html = requests.get(targetURL)
                parsed_html = BeautifulSoup(target_html.text, 'html.parser')
                if parsed_html.title.text not in pages:
                    pages.append(parsed_html.title.text)
                    print "[+] Collecting images page : " + parsed_html.title.text
                    images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                    for img_url in images:
                        imgCount = imgCount + 1
                        # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"

    pages = []
    imgCount = 0
    target = raw_input("Please enter base url: ")
    data = BeautifulSoup(requests.get(target).text, 'html.parser')
    link = data.find_all('a')
    crawler = WebCrawler()
    crawler.check(link)
    print "===================== Total Collected Images =====================\n"
    print imgCount
I want to continue doing this on the other pages as well, meaning it should keep counting until there are no links left. But when I call the check function recursively, it doesn't work:
    import requests
    from bs4 import BeautifulSoup
    import re

    class WebCrawler():
        def check(self, links):
            global imgCount
            for item in links:
                targetURL = item['href']
                if targetURL.startswith('/'):
                    targetURL = target + targetURL  # add http:// and hostname to url
                target_html = requests.get(targetURL)
                parsed_html = BeautifulSoup(target_html.text, 'html.parser')
                if parsed_html.title.text not in pages:
                    pages.append(parsed_html.title.text)
                    print "[+] Collecting images page : " + parsed_html.title.text
                    images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                    for img_url in images:
                        imgCount = imgCount + 1
                        # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
                    lnks = parsed_html.find_all('a')
                    self.check(lnks)

    pages = []
    imgCount = 0
    target = raw_input("Please enter base url: ")
    data = BeautifulSoup(requests.get(target).text, 'html.parser')
    link = data.find_all('a')
    crawler = WebCrawler()
    crawler.check(link)
    print "===================== Total Collected Images =====================\n"
    print imgCount
I added these lines to it:
lnks = parsed_html.find_all('a')
self.check(lnks)
This time, the loop only ran once!
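(What is likely happening: the recursive self.check(lnks) call sits inside the for loop, so on the very first link the crawl descends into that page's links and never returns to the remaining links before something fails, for example a page with no <title>, an href like mailto: or #, or Python's recursion limit. De-duplicating on page titles rather than URLs also lets the same URL be refetched. Below is a minimal sketch of a version that terminates, assuming Python 2 as in the original; the urljoin call, the same-host check, and the visited set are additions not present in the question's code:)

    import re
    import urlparse

    import requests
    from bs4 import BeautifulSoup

    visited = set()   # URLs already crawled, so the recursion terminates
    imgCount = 0

    class WebCrawler():
        def check(self, links):
            global imgCount
            for item in links:
                href = item.get('href')
                if not href:
                    continue
                # Resolve relative links ('/foo', 'foo.html', '#x') against the base URL.
                targetURL = urlparse.urljoin(target, href)
                # Stay on the starting site and skip pages we have already seen.
                if not targetURL.startswith(target) or targetURL in visited:
                    continue
                visited.add(targetURL)
                parsed_html = BeautifulSoup(requests.get(targetURL).text, 'html.parser')
                if parsed_html.title is not None:
                    print "[+] Collecting images page : " + parsed_html.title.text
                images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                imgCount += len(images)
                # Recurse; already-visited URLs are skipped above, so this ends.
                self.check(parsed_html.find_all('a'))

    target = raw_input("Please enter base url: ")
    data = BeautifulSoup(requests.get(target).text, 'html.parser')
    WebCrawler().check(data.find_all('a'))
    print "Total collected images:", imgCount

An iterative version with an explicit to-visit queue would avoid Python's recursion limit on large sites.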
Answer (score: 1):
Try something like this:
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import HtmlXPathSelector
    from craigslist_sample.items import CraigslistSampleItem

    class MySpider(CrawlSpider):
        name = "craigs"
        allowed_domains = ["sfbay.craigslist.org"]
        start_urls = ["http://sfbay.craigslist.org/search/npo"]

        # Follow the "next" pagination link on every page and parse each result page.
        rules = (
            Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
                 callback="parse_items", follow=True),
        )

        def parse_items(self, response):
            hxs = HtmlXPathSelector(response)
            titles = hxs.xpath('//span[@class="pl"]')
            items = []
            for title in titles:
                item = CraigslistSampleItem()
                item["title"] = title.xpath("a/text()").extract()
                item["link"] = title.xpath("a/@href").extract()
                items.append(item)
            return items
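Assuming the standard project layout from the Scrapy tutorial (the CraigslistSampleItem import implies the craigslist_sample project used there), this spider would be run with scrapy crawl craigs. The part that addresses the question is follow=True: CrawlSpider keeps applying the Rule's link extractor to every page it downloads and de-duplicates URLs automatically, so there is no manual recursion to get wrong. Note that scrapy.contrib and SgmlLinkExtractor come from older Scrapy releases; on current versions the equivalents are:

    # Modern Scrapy equivalents of the deprecated imports above
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    # Selectors are built in: use response.xpath(...) instead of HtmlXPathSelector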