I have the following spider, which I am fairly sure was working before:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from Funda.items import FundaItem
class FundaSpider(CrawlSpider):

    # def __init__(self, url='http://www.funda.nl/koop/amsterdam'):
    #     self.start_urls = [url]

    name = "Funda"
    allowed_domains = ["funda.nl"]
    start_urls = ["http://www.funda.nl/koop/amsterdam/"]

    le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % start_urls[0])
    le2 = LinkExtractor(allow=r'%s+p\d+' % start_urls[0])  # Page number links such as http://www.funda.nl/koop/amsterdam/p10/

    rules = (
        Rule(le1, callback='parse_item'),
        Rule(le2, callback='get_max_page_number')
    )

    def parse_item(self, response):
        links = self.le1.extract_links(response)
        for link in links:
            if link.url.count('/') == 6 and link.url.endswith('/'):
                item = FundaItem()
                item['url'] = link.url
                yield item

    def get_max_page_number(self, response):
        links = self.le2.extract_links(response)
        max_page_number = 0
        for link in links:
            if link.url.count('/') == 6 and link.url.endswith('/'):
                page_number = int(link.url.split("/")[-2].strip('p'))
                if page_number > max_page_number:
                    max_page_number = page_number
        return max_page_number
where items.py is:
import scrapy

class FundaItem(scrapy.Item):
    url = scrapy.Field()
    text = scrapy.Field()
If I try to run it with scrapy crawl Funda, I get an error message ending in:
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 1173, in startRunning
ReactorBase.startRunning(self)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 684, in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
I am also running another spider from a script using CrawlerProcess, and I suspect that this may be interfering with this spider. How can I get the spider to run?
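
For context, the pattern used to run the other spider from a script looks roughly like the following; this is a minimal sketch, where OtherSpider and its import path are placeholders rather than the actual names:

# Minimal sketch of running a spider from a script via CrawlerProcess.
# Assumption: OtherSpider and Funda.spiders.other_spider are placeholder names.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from Funda.spiders.other_spider import OtherSpider  # hypothetical import

process = CrawlerProcess(get_project_settings())
process.crawl(OtherSpider)
process.start()  # starts the Twisted reactor and blocks until the crawl finishes

# Twisted's reactor cannot be started twice in one Python process, so calling
# process.start() again (or starting another crawl after the reactor has
# stopped) raises twisted.internet.error.ReactorNotRestartable.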