Scrapy: pages crawled but 0 items scraped

Date: 2016-05-18 23:33:09

Tags: python-2.7 scrapy

I am trying to scrape baseball-reference.com. In the Scrapy bot I created, the spider starts from one page, navigates to a set of links, and from each of those follows a third link. Please find the code below:

import datetime
import time

from scrapy import Request
from scrapy.selector import Selector
from scrapy.spiders.init import InitSpider
from selenium import webdriver

# assumption: VisitorbattingItem is defined in the project's items module
from myproject.items import VisitorbattingItem


class VisitorBattingSpider(InitSpider):

    name = 'VisitorBatting'
    year = str(datetime.datetime.today().year)
    allowed_domains = ["baseball-reference.com"]
    start = 'http://www.baseball-reference.com/boxes/' + year + '.shtml'
    start_urls = [start]
    #rules = [Rule(LinkExtractor(allow=['/play-index/st.cgi?date=\d+-\d+-\d+']), callback='parse_item',)]

    def __init__(self, *args, **kwargs):
        super(VisitorBattingSpider, self).__init__(*args, **kwargs)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()


    def parse(self, response):
        self.browser.get(response.url)
        # give the JavaScript-rendered content time to load
        time.sleep(15)
        page = Selector(text=self.browser.page_source)
        sites = page.xpath('//*[@id="2016"]/tbody/tr/td/table/tbody/tr/td/a/@href')

        for site in sites:
            tree = site.extract()
            yield Request(url='http://www.baseball-reference.com' + tree,
                          callback=self.parse_new, dont_filter=True)
        # the shared browser is reused by parse_final, so it stays open here

    def parse_new(self, response):
        hxs = Selector(response)
        loads = hxs.xpath('/html/body/pre/a/@href')

        for load in loads:
            branch = load.extract()
            if 'boxes' in branch:
                yield Request(url='http://www.baseball-reference.com' + branch,
                              callback=self.parse_final, dont_filter=True)


    def parse_final(self, response):
        self.browser.get(response.url)
        fxs = Selector(text=self.browser.page_source)
        vi = fxs.xpath('/html/body/div/div[3]/div[1]/div[1]/h3/text()').extract()
        vis = ''.join(vi)
        # strip periods and spaces so the team name matches the table's id attribute
        visitor = vis.replace('.', '')
        visitor_id = visitor.replace(' ', '')
        print visitor_id
        # the game date sits in the last 15 characters of the box-score URL,
        # in YYYYMMDD form: year dt[:4], month dt[4:6], day dt[-2:]
        dtt = response.url[-15:]
        dt = dtt[:8]
        day = datetime.datetime(int(dt[:4]), int(dt[4:6]), int(dt[-2:]), 1, 1, 1).weekday()
        path = '//*[@id="' + visitor_id + 'batting"]/tfoot/tr'
        webs = fxs.xpath(path)
        items = []

        for web in webs:
            item = VisitorbattingItem()
            item['ID'] = response.url
            item['AWAY_TEAM'] = visitor_id
            item['GAME_DT'] = dt
            item['GAME_DY'] = day
            item['AWAY_GAME'] = 1
            item['AWAY_SCORE_CT'] = web.xpath("td[3]/text()").extract()
            item['MINUTES_GAME_CT'] = fxs.xpath('//*[@id="gametime"]/text()').extract()
            item['AWAY_AB'] = web.xpath("td[2]/span/text()").extract()
            item['AWAY_HITS'] = web.xpath("td[4]/text()").extract()
            item['AWAY_DO'] = fxs.xpath('//*[@id="2Bvisitor"]/text()').extract()
            item['AWAY_TR'] = fxs.xpath('//*[@id="3Bvisitor"]/text()').extract()
            item['AWAY_RBI'] = web.xpath("td[5]/text()").extract()
            item['AWAY_HBP'] = fxs.xpath('//*[@id="HBPvisitor"]/text()').extract()
            item['AWAY_SB'] = fxs.xpath('//*[@id="SBvisitor"]/text()').extract()
            item['AWAY_LOB'] = fxs.xpath('//*[@id="teamlobvisitor"]/text()').extract()
            item['AWAY_PO'] = web.xpath("td[5]/text()").extract()
            item['AWAY_ASS'] = web.xpath("td[5]/text()").extract()
            item['AWAY_ERR'] = fxs.xpath('//*[@id="linescore"]/strong[3]/text()').extract()
            item['AWAY_PB'] = fxs.xpath('//*[@id="PBvisitor"]/text()').extract()
            item['AWAY_DP'] = fxs.xpath('//*[@id="DPvisitor"]/text()').extract()
            item['AWAY_TP'] = fxs.xpath('//*[@id="TPvisitor"]/text()').extract()
            item['AWAY_First_Innings'] = fxs.xpath('//*[@id="linescore"]/text()[3]').extract()
            item['AWAY_IBB'] = fxs.xpath('//*[@id="IBBvisitor"]/text()').extract()
            item['AWAY_BB'] = web.xpath("td[6]/text()").extract()
            item['AWAY_SO'] = web.xpath("td[7]/text()").extract()
            items.append(item)
        # parse_final runs once per box score; closing the shared browser
        # here would break every later call, so leave it open
        return items
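
For reference, since the spider shares one WebDriver instance across callbacks, a safer place to release it than `__del__` is Scrapy's `closed()` hook, which Scrapy calls exactly once when the spider finishes. A minimal sketch (the spider name and start URL are placeholders, not part of the original code):

    from scrapy.spiders.init import InitSpider
    from selenium import webdriver

    class BrowserLifecycleSpider(InitSpider):
        # hypothetical spider used only to illustrate the cleanup pattern
        name = 'browser_lifecycle_demo'
        start_urls = ['http://www.baseball-reference.com/boxes/']

        def __init__(self, *args, **kwargs):
            super(BrowserLifecycleSpider, self).__init__(*args, **kwargs)
            self.browser = webdriver.Firefox()

        def closed(self, reason):
            # runs after the last callback, so quitting here cannot
            # break an in-flight request
            self.browser.quit()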

The problem is that when I execute the script, the messages at my CMD prompt show pages being crawled but "scraped 0 items". I don't understand why the items are not being scraped. Any help would be greatly appreciated.
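
One way to narrow this down: "scraped 0 items" usually means the callbacks yielded nothing, so it is worth checking, outside the spider, whether the `tfoot` XPath matches anything in the Selenium-rendered source. If it returns an empty list, the `for web in webs:` loop never runs and no items are built. A minimal sketch, where the box-score URL and the table id are assumptions following the `visitor_id + 'batting'` pattern the spider expects:

    from scrapy.selector import Selector
    from selenium import webdriver

    browser = webdriver.Firefox()
    # assumed box-score URL of the /boxes/TEAM/TEAMyyyymmdd0.shtml form
    browser.get('http://www.baseball-reference.com/boxes/NYA/NYA201605180.shtml')
    sel = Selector(text=browser.page_source)

    # assumed id: the visiting team's name with spaces and periods removed + 'batting'
    rows = sel.xpath('//*[@id="NewYorkYankeesbatting"]/tfoot/tr')
    print len(rows)  # 0 here means parse_final builds no items
    browser.quit()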

0 answers:

No answers yet