Tripadvisor上的Scrapy,Crawling评论:如何应用双递归规则?

时间:2014-11-04 12:52:48

标签: python web-scraping scrapy scrapy-spider

这就是我的蜘蛛的样子:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy import Selector
from scrapy.http import Request
from tripadvisor.items import TripadvisorItem,TripadvisorItem2
from tripadvisor.id_generator import id_maker
from scrapy import log

class TripadvisorSpider(CrawlSpider):
    name ="trippy5"
    allowed_domanis=["tripadvisor.com"] 
    start_urls=[
    'http://www.tripadvisor.com/Hotels-g187371-Cologne_North_Rhine_Westphalia-Hotels.html']

    rules = (Rule(LxmlLinkExtractor(allow=(),restrict_xpaths=('//*[@class="guiArw sprite-pageNext "]')),
            callback="parse_start_url",  follow= True),
           Rule(LxmlLinkExtractor(allow=(),restrict_xpaths=('//*[@class="guiArw sprite-pageNext  pid4181"]')),
            callback="parse_hotel",  follow= True),)


    def __init__(self, *a, **kw):
        super(TripadvisorSpider, self).__init__(*a, **kw)
        self._compile_rules()
        logfile = open('tripadvisor_log.log', 'w')
        log_observer = log.ScrapyFileLogObserver(logfile, level=log.DEBUG)
        log_observer.start()
    def parse_start_url(self, response):

        sel = Selector(response)
        titles = sel.xpath('//div[@class="metaLocationInfo"]')
        for title in titles:
            hotel_id=id_maker()
            item = TripadvisorItem()
            item['id_hotel']=hotel_id
            item ["name"] = (title.xpath('.//a[@class="property_title"]/text()').extract()[0]).strip()
            star=title.xpath('.//img[@class="sprite-ratings-gry"]/@alt').extract()

            if star:
                star=star[0]       
            star=star[0:3]                
            item ["stars"] =star 
        else:
            item ["stars"] =999

            overall_rating=title.xpath('.//img[@class="sprite-ratings"]/@alt').extract()
            if overall_rating:
                overall_rating=overall_rating[0]
                if overall_rating[1]=='.':
                    overall_rating=overall_rating[0:3]
                else:
                    overall_rating=overall_rating[0]

                item ["overall_rating"]=overall_rating
            else:
                item ["overall_rating"]=999


            link=title.xpath('.//a[@class="property_title"]/@href').extract()[0]            
            url = "http://tripadvisor.com{}".format(link)

            yield Request(url=url, meta={'item': item}, callback=self.parse_hotel)    


    def parse_hotel(self, response):
        items=[]
        item = response.meta['item']
        sel = Selector(response)

        review_boxes =sel.xpath('//*[@class="reviewSelector "]')        
        if review_boxes:            
            for box in review_boxes:
                if box.xpath('.//span[@class="noQuotes"]/text()').extract():
                    item2=TripadvisorItem2()
                    item2 ['id_hotel']=item['id_hotel']
                    item2 ["hotel_name"]=item ["name"]
                    item2 ["review_title"] = box.xpath('.//span[@class="noQuotes"]/text()').extract()[0]
                    item2 ["review_rate"] =box.xpath('.//div/div/span/img/@alt').extract()[0][0]                  
                    if box.xpath('.//div/div/span/@title').extract():                    
                        item2 ["review_date"] =box.xpath('.//div/div/span/@title').extract()[0]                    
                    else:                        
                        item2 ["review_date"] =(box.xpath('.//div/div/span[@class="ratingDate"]/text()').extract()[0]).replace('Reviewed ','')                  
                    contributor_name=box.xpath('.//div[@class="username mo"]/span/text()').extract()
                    if contributor_name:
                        item2 ["contributor_name"] =contributor_name[0]
                    location=box.xpath('.//div[@class="location"]/text()').extract()                   
                    if location:
                        if location==[u'\n']:
                            item2 ["contributor_location"]='N/A' 
                        else:
                            item2 ["contributor_location"]=(location[0]).strip()
                    overall_contributions=box.xpath('.//span[@class="badgeText"]/text()').extract()
                    if overall_contributions:
                        item2 ["overall_contributions"] =((overall_contributions[0]).replace(' review','')).replace('s','')
                    else:
                        item2 ["overall_contributions"] =999
                    item2 ["text"] =(box.xpath('.//div[@class="entry"]/p/text()').extract())[0].strip()
                    items.append(item2)

                    yield item2

第一条规则成功运作,浏览列出酒店的所有网页。

蜘蛛只能通过每家酒店评论的第一页抓取,不幸的是忽略了第二条规则应该让它通过评论页面进行爬行。

由于不同的回调和不同的xpath,我认为How to fix scrapy rules when only one rule is followed不适用于此。

我请求帮助!

0 个答案:

没有答案