我正在尝试抓取 baseball-reference.com 的数据。在我创建的 Scrapy 爬虫中,我从第 1 页开始,导航到不同的链接,然后从那里导航到第三层链接。请看以下代码:
class VisitorBattingSpider(InitSpider):
name = 'VisitorBatting'
year=str(datetime.datetime.today().year)
allowed_domains = ["baseball-reference.com"]
start= 'http://www.baseball-reference.com/boxes/'+year+'.shtml'
start_urls=[start]
#rules = [Rule(LinkExtractor(allow=['/play-index/st.cgi?date=\d+-\d+-\d+']), callback='parse_item',)]
def __init__(self):
BaseSpider.__init__(self)
# use any browser you wish
self.browser = webdriver.Firefox()
def __del__(self):
self.browser.close()
def parse(self, response):
self.browser.get(response.url)
# let JavaScript Load
time.sleep(15)
page=Selector(text=self.browser.page_source)
#page=Selector(response)
sites=page.xpath('//*[@id="2016"]/tbody/tr/td/table/tbody/tr/td/a/@href')
for site in sites:
tree = site.extract()
yield Request(url='http://www.baseball-reference.com'+tree,callback=self.parse_new,dont_filter=True)
self.browser.close()
def parse_new(self, response):
hxs=Selector(response)
loads = hxs.xpath('/html/body/pre/a/@href')
for load in loads:
branch=load.extract()
if 'boxes' in branch:
yield Request(url='http://www.baseball-reference.com'+branch,callback=self.parse_final,dont_filter=True)
def parse_final(self, response):
self.browser.get(response.url)
fxs=Selector(text=self.browser.page_source)
vi= fxs.xpath('html/body/div/div[3]/div[1]/div[1]/h3/text()').extract()
vis=''.join(vi)
if "." in vis:
visitor=vis.replace(".","")
else:
visitor=vis
visitor_id=visitor.replace(" ","")
print visitor_id
UR=response.url
URL=''.join(UR)
dtt=URL[-15:]
dt=dtt[:8]
day=datetime.datetime(int(dt[:4]),int(dt[5:6]),int(dt[-2:]),01,01,01).weekday()
path = '//*[@id="'+visitor_id+'batting"]/tfoot/tr'
webs=fxs.xpath(path)
items=[]
for web in webs:
item=VisitorbattingItem()
item['ID']=response.url
item['AWAY_TEAM']=visitor_id
item['GAME_DT']=dt
item['GAME_DY']=day
item['AWAY_GAME']=1
item['AWAY_SCORE_CT']=web.xpath("td[3]/text()").extract()
item['MINUTES_GAME_CT']=fxs.xpath('//*[@id="gametime"]/text()').extract()
item['AWAY_AB']=web.xpath("td[2]/span/text()").extract()
item['AWAY_HITS']=web.xpath("td[4]/text()").extract()
item['AWAY_DO']=fxs.xpath('//*[@id="2Bvisitor"]/text()').extract()
item['AWAY_TR']=fxs.xpath('//*[@id="3Bvisitor"]/text()').extract()
item['AWAY_RBI']=web.xpath("td[5]/text()").extract()
item['AWAY_HBP']=fxs.xpath('//*[@id="HBPvisitor"]/text()').extract()
item['AWAY_SB']=fxs.xpath('//*[@id="SBvisitor"]/text()').extract()
item['AWAY_LOB']=fxs.xpath('//*[@id="teamlobvisitor"]/text()').extract()
item['AWAY_PO']=web.xpath("td[5]/text()").extract()
item['AWAY_ASS']=web.xpath("td[5]/text()").extract()
item['AWAY_ERR']=fxs.xpath('//*[@id="linescore"]/strong[3]/text()').extract
item['AWAY_PB']=fxs.xpath('//*[@id="PBvisitor"]/text()').extract()
item['AWAY_DP']=fxs.xpath('//*[@id="DPvisitor"]/text()').extract()
item['AWAY_TP']=fxs.xpath('//*[@id="TPvisitor"]/text()').extract()
item['AWAY_First_Innings']=fxs.xpath('//*[@id="linescore"]/text()[3]').extract
item['AWAY_IBB']=fxs.xpath('//*[@id="IBBvisitor"]/text()').extract()
item['AWAY_BB']=web.xpath("td[6]/text()").extract()
item['AWAY_SO']=web.xpath("td[7]/text()").extract()
items.append(item)
self.browser.close()
return items
问题在于,当我执行脚本时,CMD 提示符上显示的消息是"已抓取 N 个页面(爬取了 0 项)"。我不明白为什么这些条目没有被抓取出来。任何帮助将不胜感激。