这就是我的蜘蛛的样子:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy import Selector
from scrapy.http import Request
from tripadvisor.items import TripadvisorItem,TripadvisorItem2
from tripadvisor.id_generator import id_maker
from scrapy import log
class TripadvisorSpider(CrawlSpider):
name ="trippy5"
allowed_domanis=["tripadvisor.com"]
start_urls=[
'http://www.tripadvisor.com/Hotels-g187371-Cologne_North_Rhine_Westphalia-Hotels.html']
rules = (Rule(LxmlLinkExtractor(allow=(),restrict_xpaths=('//*[@class="guiArw sprite-pageNext "]')),
callback="parse_start_url", follow= True),
Rule(LxmlLinkExtractor(allow=(),restrict_xpaths=('//*[@class="guiArw sprite-pageNext pid4181"]')),
callback="parse_hotel", follow= True),)
def __init__(self, *a, **kw):
super(TripadvisorSpider, self).__init__(*a, **kw)
self._compile_rules()
logfile = open('tripadvisor_log.log', 'w')
log_observer = log.ScrapyFileLogObserver(logfile, level=log.DEBUG)
log_observer.start()
def parse_start_url(self, response):
sel = Selector(response)
titles = sel.xpath('//div[@class="metaLocationInfo"]')
for title in titles:
hotel_id=id_maker()
item = TripadvisorItem()
item['id_hotel']=hotel_id
item ["name"] = (title.xpath('.//a[@class="property_title"]/text()').extract()[0]).strip()
star=title.xpath('.//img[@class="sprite-ratings-gry"]/@alt').extract()
if star:
star=star[0]
star=star[0:3]
item ["stars"] =star
else:
item ["stars"] =999
overall_rating=title.xpath('.//img[@class="sprite-ratings"]/@alt').extract()
if overall_rating:
overall_rating=overall_rating[0]
if overall_rating[1]=='.':
overall_rating=overall_rating[0:3]
else:
overall_rating=overall_rating[0]
item ["overall_rating"]=overall_rating
else:
item ["overall_rating"]=999
link=title.xpath('.//a[@class="property_title"]/@href').extract()[0]
url = "http://tripadvisor.com{}".format(link)
yield Request(url=url, meta={'item': item}, callback=self.parse_hotel)
def parse_hotel(self, response):
items=[]
item = response.meta['item']
sel = Selector(response)
review_boxes =sel.xpath('//*[@class="reviewSelector "]')
if review_boxes:
for box in review_boxes:
if box.xpath('.//span[@class="noQuotes"]/text()').extract():
item2=TripadvisorItem2()
item2 ['id_hotel']=item['id_hotel']
item2 ["hotel_name"]=item ["name"]
item2 ["review_title"] = box.xpath('.//span[@class="noQuotes"]/text()').extract()[0]
item2 ["review_rate"] =box.xpath('.//div/div/span/img/@alt').extract()[0][0]
if box.xpath('.//div/div/span/@title').extract():
item2 ["review_date"] =box.xpath('.//div/div/span/@title').extract()[0]
else:
item2 ["review_date"] =(box.xpath('.//div/div/span[@class="ratingDate"]/text()').extract()[0]).replace('Reviewed ','')
contributor_name=box.xpath('.//div[@class="username mo"]/span/text()').extract()
if contributor_name:
item2 ["contributor_name"] =contributor_name[0]
location=box.xpath('.//div[@class="location"]/text()').extract()
if location:
if location==[u'\n']:
item2 ["contributor_location"]='N/A'
else:
item2 ["contributor_location"]=(location[0]).strip()
overall_contributions=box.xpath('.//span[@class="badgeText"]/text()').extract()
if overall_contributions:
item2 ["overall_contributions"] =((overall_contributions[0]).replace(' review','')).replace('s','')
else:
item2 ["overall_contributions"] =999
item2 ["text"] =(box.xpath('.//div[@class="entry"]/p/text()').extract())[0].strip()
items.append(item2)
yield item2
第一条规则成功运作,浏览列出酒店的所有网页。
蜘蛛只能通过每家酒店评论的第一页抓取,不幸的是忽略了第二条规则应该让它通过评论页面进行爬行。
由于不同的回调和不同的xpath,我认为How to fix scrapy rules when only one rule is followed不适用于此。
我请求帮助!