我正在尝试抓取(scrape)这个网站:http://stats.swehockey.se/ScheduleAndResults/Schedule/3940
到目前为止(感谢 alecxe),我已经能够提取出比赛日期和对阵球队。
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaItem(Item):
    """Scrapy item holding the data scraped from one schedule-table row."""

    # Filled by the spider with the text nodes found in the row's date cell.
    date = Field()
    # Filled by the spider with the text nodes found in the row's teams cell.
    teams = Field()
class SchemaSpider(BaseSpider):
    """Spider that turns every row of the schedule table into a SchemaItem."""

    name = "schema"
    # Scrapy documents allowed_domains as a list of bare domain names;
    # a full URL (scheme and trailing slash) is not a valid entry.
    allowed_domains = ["stats.swehockey.se"]
    start_urls = [
        "http://stats.swehockey.se/ScheduleAndResults/Schedule/3940",
    ]

    def parse(self, response):
        """Yield one SchemaItem per <tr> directly under the "tblContent" table.

        Each item carries two lists of extracted text nodes:
        ``date`` (from ``td[2]/div/span``) and ``teams`` (from ``td[3]``).
        Rows without those cells still produce an item with empty lists.
        """
        hxs = HtmlXPathSelector(response)
        for row in hxs.select('//table[@class="tblContent"]/tr'):
            item = SchemaItem()
            item['date'] = row.select('.//td[2]/div/span/text()').extract()
            item['teams'] = row.select('.//td[3]/text()').extract()
            yield item
所以,我的下一步是过滤掉所有不是“AIK”或“Djurgårdens IF”主场的比赛。之后我需要把结果重新格式化为 .ics 文件,以便添加到 Google Calendar。
编辑:我已经解决了一部分问题,但还有很多工作要做。我的代码现在是这样的:
# -*- coding: UTF-8 -*-
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaItem(Item):
    """Scrapy item for a single row of the schedule table."""

    # List of text nodes the spider extracts from the row's date cell.
    date = Field()
    # List of text nodes the spider extracts from the row's teams cell.
    teams = Field()
class SchemaSpider(BaseSpider):
    """Spider that prints the fixtures whose home side mentions a wanted team."""

    name = "schema"
    # Scrapy documents allowed_domains as a list of bare domain names;
    # a full URL (scheme and trailing slash) is not a valid entry.
    allowed_domains = ["stats.swehockey.se"]
    start_urls = [
        "http://stats.swehockey.se/ScheduleAndResults/Schedule/3940",
    ]

    def parse(self, response):
        """Print date parts and both teams for each matching schedule row.

        For every <tr> directly under the "tblContent" table, the teams cell
        text is split on "-" into a home part and an away part.  When the
        home part matches, one line is printed per extracted date string:
        ``year month day hour minute home_team away_team``.
        Nothing is returned or yielded.
        """
        hxs = HtmlXPathSelector(response)
        for row in hxs.select('//table[@class="tblContent"]/tr'):
            item = SchemaItem()
            item['date'] = row.select('.//td[2]/div/span/text()').extract()
            item['teams'] = row.select('.//td[3]/text()').extract()

            for fixture in item['teams']:
                teams = fixture.split('-')
                if len(teams) < 2:
                    # Cell text without a "-" has no away part; skip it
                    # instead of failing on teams[1].
                    continue
                home_team = teams[0]
                away_team = teams[1]

                # NOTE: these are substring tests, so any home team whose
                # name merely contains "AIK" (e.g. "Skellefteå AIK") is
                # printed as well.
                if "AIK" in home_team or u"Djurgårdens" in home_team:
                    for stamp in item['date']:
                        # Fixed-position slices; presumably the text is laid
                        # out as "YYYY-MM-DD HH:MM" -- confirm against the page.
                        year = stamp[0:4]
                        month = stamp[5:7]
                        day = stamp[8:10]
                        hour = stamp[11:13]
                        minute = stamp[14:16]
                        # One pre-joined argument gives the same
                        # space-separated line under the Python 2 print
                        # statement and the Python 3 print function.
                        print(u" ".join([year, month, day, hour, minute,
                                         home_team, away_team]))
这段代码会打印出“AIK”、“Djurgårdens IF”和“Skellefteå AIK”的比赛。所以我的问题显然是:如何过滤掉“Skellefteå AIK”的比赛,以及有没有什么简单的方法可以让这个程序变得更好。对此有什么想法吗?
祝你好运!
答案 0 :(得分:1)
我猜所谓主场比赛,就是你要找的球队排在前面(破折号之前)的那些比赛。
您可以在 XPath 或 Python 中完成这一过滤。如果要在 XPath 中完成,请只选择主队名称符合要求的行。
//table[@class="tblContent"]/tr[
normalize-space(substring-before(.//td[3]/text(), "-")) = "AIK"
or
normalize-space(substring-before(.//td[3]/text(), "-")) = "Djurgårdens IF"
]
您可以放心地删除表达式中用于排版的所有空白(包括换行符),我加上它们只是为了便于阅读。
在 Python 中也可以做同样的事情,借助正则表达式甚至可以写得更简洁。
答案 1 :(得分:1)
需要注意几点:
- `string` 是 Python 标准库模块的名称,因此通常最好避免把它用作自己的变量名。
- 去掉首尾空白之后,`home_team` 就可以直接与所需的“AIK”进行相等比较。
- 我对 `home_team` 和 `away_team` 使用了 `string.strip()`,因为它比 `string.replace(" ", "")` 更简洁,不过这属于个人偏好。
- 我在 `print` 行中的主队和客队之间添加了一个“:”,以便在测试时更清楚地区分它们,您可以随意去掉这一改动。
请试一试,如果还有其他问题请告诉我。:)
def parse(self, response):
    """Print the home fixtures of the wanted teams found in the schedule.

    For every <tr> directly under the "tblContent" table, the teams cell
    text is split on "-" and both parts are stripped of surrounding
    whitespace.  When the home part is exactly one of the wanted names, one
    line is printed per extracted date string:
    ``year month day hour minute home_team : away_team``.
    Nothing is returned or yielded.
    """
    # Exact names: an equality test keeps other clubs whose name merely
    # contains "AIK" out of the result.
    wanted_home_teams = (u"AIK", u"Djurgårdens IF")

    hxs = HtmlXPathSelector(response)
    for row in hxs.select('//table[@class="tblContent"]/tr'):
        item = SchemaItem()
        item['date'] = row.select('.//td[2]/div/span/text()').extract()
        item['teams'] = row.select('.//td[3]/text()').extract()

        for fixture in item['teams']:
            teams = fixture.split('-')
            if len(teams) < 2:
                # Cell text without a "-" has no away part; skip it instead
                # of failing on teams[1].
                continue
            home_team = teams[0].strip()
            away_team = teams[1].strip()
            if home_team not in wanted_home_teams:
                continue

            for fix_date in item['date']:
                # Fixed-position slices; presumably the text is laid out as
                # "YYYY-MM-DD HH:MM" -- confirm against the page.
                year = fix_date[0:4]
                month = fix_date[5:7]
                day = fix_date[8:10]
                hour = fix_date[11:13]
                minute = fix_date[14:16]
                # One pre-joined argument gives the same space-separated
                # line under the Python 2 print statement and the Python 3
                # print function.
                print(u" ".join([year, month, day, hour, minute,
                                 home_team, u":", away_team]))