以下使用固定 start_urls 的爬虫可以正常工作:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class NumberOfPagesSpider(CrawlSpider):
    """Record the highest results-page number linked from a funda.nl listing.

    ``rules`` routes responses for links matched by ``le_maxpage`` to
    ``get_max_page_number``, which writes the largest page number it finds
    to ``max_pages.txt``.
    """

    name = "number_of_pages"
    allowed_domains = ["funda.nl"]

    # Parameterised variant, left disabled: enabling it while removing the
    # class-level start_urls breaks the le_maxpage line below, which reads
    # start_urls while the class body is being executed.
    # def __init__(self, place='amsterdam'):
    #     self.start_urls = ["http://www.funda.nl/koop/%s/" % place]
    start_urls = ["http://www.funda.nl/koop/amsterdam/"]

    # start_urls[0] ends with '/', so the '+' quantifier applies to that slash.
    le_maxpage = LinkExtractor(allow=r'%s+p\d+' % start_urls[0])
    rules = (Rule(le_maxpage, callback='get_max_page_number'),)

    def get_max_page_number(self, response):
        """Write the highest page number linked from *response* to a file.

        Only links containing exactly six '/' characters and ending in '/'
        are considered, e.g. 'http://www.funda.nl/koop/amsterdam/p10/' -> 10.
        If no such link is found, 0 is written.
        """
        max_page_number = 0
        for link in self.le_maxpage.extract_links(response):
            url = link.url
            # Keep only pagination links with a link depth of 3.
            if url.count('/') != 6 or not url.endswith('/'):
                continue
            try:
                page_number = int(url.split("/")[-2].strip('p'))
            except ValueError:
                # Segment is not of the form 'p<number>'; ignore the link
                # instead of aborting the whole callback.
                continue
            if page_number > max_page_number:
                max_page_number = page_number

        # Text mode: the payload is a str, which a binary-mode file object
        # rejects on Python 3.
        with open("max_pages.txt", 'w') as f:
            f.write('max_page_number = %s' % max_page_number)
如果我用 scrapy crawl number_of_pages 运行它,它会按预期写入 .txt 文件。但是,如果我取消 def __init__ 那两行的注释,并把 start_urls = 那一行注释掉,然后尝试用用户定义的输入参数运行它:
scrapy crawl number_of_pages -a place=amsterdam
我收到以下错误:
le_maxpage = LinkExtractor(allow=r'%s+p\d+' % start_urls[0])
NameError: name 'start_urls' is not defined
也就是说,爬虫认为 start_urls 未定义,尽管代码已经在 __init__ 中对它进行了赋值。怎样才能让这个爬虫使用由输入参数定义的 start_urls?
答案 0 :(得分:2)
您的 le_maxpage 是一个类变量。而把参数传递给 __init__ 时,您创建的是实例级变量 start_urls。
您在 le_maxpage 中使用了 start_urls,因此要使 le_maxpage 变量起作用,就需要有一个名为 start_urls 的类级别变量。
要解决此问题,您需要将这些类级别变量移到实例级别,即在 __init__ 块中定义它们。
答案 1 :(得分:1)
按照masnun的回答,我设法解决了这个问题。为了完整起见,我列出了下面的更新代码。
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class NumberOfPagesSpider(CrawlSpider):
    """Record the highest results-page number linked from a funda.nl listing.

    The listing to inspect is chosen with the ``place`` spider argument,
    e.g. ``scrapy crawl number_of_pages -a place=amsterdam``.

    NOTE(review): Scrapy's documentation warns against overriding ``parse``
    on a CrawlSpider; confirm that ``parse`` is still invoked for the start
    URL on the Scrapy version in use.
    """

    name = "number_of_pages"
    allowed_domains = ["funda.nl"]

    def __init__(self, place='amsterdam', **kwargs):
        """Build the start URL and link extractor for *place*.

        Args:
            place: Place name inserted into the funda.nl listing URL.
            **kwargs: Any further spider arguments, forwarded unchanged to
                the base-class initialiser.
        """
        # Let the base classes perform their own setup before the
        # per-instance attributes are assigned.
        super(NumberOfPagesSpider, self).__init__(**kwargs)
        self.start_urls = ["http://www.funda.nl/koop/%s/" % place]
        # The start URL ends with '/', so the '+' quantifier applies to
        # that slash.
        self.le_maxpage = LinkExtractor(allow=r'%s+p\d+' % self.start_urls[0])
        # NOTE(review): this tuple is only a local name; it is never
        # assigned to self.rules, so it is not registered as a crawl rule.
        # Kept as in the original pending a decision on link following.
        rules = (Rule(self.le_maxpage, ),)

    def parse(self, response):
        """Write the highest page number linked from *response* to a file.

        Only links containing exactly six '/' characters and ending in '/'
        are considered, e.g. 'http://www.funda.nl/koop/amsterdam/p10/' -> 10.
        If no such link is found, 0 is written.
        """
        max_page_number = 0
        for link in self.le_maxpage.extract_links(response):
            url = link.url
            # Keep only pagination links with a link depth of 3.
            if url.count('/') != 6 or not url.endswith('/'):
                continue
            try:
                page_number = int(url.split("/")[-2].strip('p'))
            except ValueError:
                # Segment is not of the form 'p<number>'; ignore the link
                # instead of aborting the whole callback.
                continue
            if page_number > max_page_number:
                max_page_number = page_number

        # Text mode: the payload is a str, which a binary-mode file object
        # rejects on Python 3.
        with open("max_pages.txt", 'w') as f:
            f.write('max_page_number = %s' % max_page_number)
请注意,Rule 甚至不需要 callback,因为始终会调用 parse。