Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem

class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']

    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
        return vriskoit
The pages I am trying to crawl follow the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls every page except the first one! Any idea why this happens?
Thanks in advance!
Answer 0 (score: 2)
If you look at the Scrapy docs, the responses for start_urls are sent to the parse method, so you can change your rule like this:
rules = (
    Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and rename the method from def parse_vrisko(self, response): to def parse(self, response):
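With both changes applied, a minimal sketch of the renamed callback (everything else in the spider stays as in your question):

    def parse(self, response):
        # Now runs for the start_urls response as well as for the
        # ?page=x pages matched by the rule.
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        return vriskoit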
Or you can remove start_urls, start your spider with def start_requests(self):, and use parse_vrisko as the callback.
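A minimal sketch of that second option, assuming the rule and parse_vrisko stay exactly as in your question (the URL is the one from your start_urls):

from scrapy.http import Request

class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    # No start_urls here; the first request is built by hand below.

    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def start_requests(self):
        # Yield the first page with parse_vrisko as an explicit callback,
        # instead of letting CrawlSpider send it to its default parse().
        yield Request('http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF',
                      callback=self.parse_vrisko)

    # parse_vrisko stays unchanged from your original spider.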