I'm trying to scrape multiple tables whose titles are stored under h3 tags. I can get the data columns without a problem, and when I feed in the next URL I can append that data to a csv file. The problem I can't solve is getting the table heading and storing it against every row of that table, so that when the next table is fed in I know which table each row belongs to. Is it possible to loop with len over, say, 'Round' to establish the table length and then write the table heading against each row? Is it possible to handle this with item export?
Here is my code, spider.py:
from bigcrawler.items import BigcrawlerItem, MatchStatItemLoader
from scrapy import Spider, Request, Selector

class CrawlbotSpider(Spider):
    name = 'bigcrawler'
    allowed_domains = ['www.matchstat.com']
    start_urls = [
        'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
        'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
    ]

    def parse_header(self, response):
        hxs = Selector(response)
        for tb in hxs.css('tr.match'):
            heading = tb.xpath('//*[@id="AWS"]/div/h3/text()').extract()[0]
            for td in tb.xpath(".//tr[contains(@class, 'match')]/td[contains(@class, 'round')]/text()"):
                il = BigcrawlerItem(selector=td)
                il.add_value('event_title', heading)
                yield il.load_item()

    def parse(self, response):
        for row in response.css('tr.match'):
            il = MatchStatItemLoader(selector=row)
            il.add_css('round', '.round::text')
            il.add_css('event1', '.event-name a::text')
            il.add_css('player_1', '.player-name:nth-child(2) a::text')
            il.add_css('player_2', '.player-name:nth-child(3) a::text')
            il.add_css('player_1_odds', '.odds-td.odds-0 [payout]::text')
            il.add_css('player_2_odds', '.odds-td.odds-1 [payout]::text')
            il.add_css('h_2_h', 'a.h2h::text')
            yield il.load_item()
items.py:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from operator import methodcaller
from scrapy import Spider, Request, Selector

class BigcrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    event_title = scrapy.Field()
    round = scrapy.Field()
    event1 = scrapy.Field()
    player_1 = scrapy.Field()
    player_2 = scrapy.Field()
    player_1_odds = scrapy.Field()
    player_2_odds = scrapy.Field()
    h_2_h = scrapy.Field()

class MatchStatItemLoader(ItemLoader):
    default_item_class = BigcrawlerItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
Answer 0 (score: 1)
If there is only one of those headings, you don't need it to be relative to the current node; try this:
il.add_xpath('event_title', '//*[@id="AWS"]//h3/text()')
But if you do need it relative to the current node, you can also do this:
il.add_xpath('event_title', './ancestor::*[@id="AWS"]//h3/text()')
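For reference, here is a minimal, untested sketch of how either of those calls could slot into the parse method from the question, keeping the existing MatchStatItemLoader:

def parse(self, response):
    for row in response.css('tr.match'):
        il = MatchStatItemLoader(selector=row)
        # A path starting with // searches the whole page even though the
        # loader was built from a row selector, so every row gets the heading.
        il.add_xpath('event_title', '//*[@id="AWS"]//h3/text()')
        il.add_css('round', '.round::text')
        # ... the remaining add_css calls stay exactly as in the question ...
        yield il.load_item()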
Answer 1 (score: 0)
I'd suggest not using the Items class at all, and using the start_requests method instead of start_urls, because they are really confusing. See the complete code below, and note the match_heading variable.
class CrawlbotSpider(Spider):
    name = 'bigcrawler'
    allowed_domains = ['www.matchstat.com']
    def start_requests(self):
        match_urls = [
            'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
            'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
        ]
        for url in match_urls:
            yield Request(url=url, callback=self.parse_matches)
    def parse_matches(self, response):
        # The tournament heading is read once per page and written into every row.
        match_heading = response.xpath('//*[@id="AWS"]/div/h3/text()').extract_first()
        for row in response.css('tr.match'):
            match = {}
            match['heading'] = match_heading
            match['round'] = row.css(".round::text").extract_first()
            match['event1'] = row.css(".event-name a::text").extract_first()
            match['player_1'] = row.css(".player-name:nth-child(2) a::text").extract_first()
            match['player_2'] = row.css(".player-name:nth-child(3) a::text").extract_first()
            match['player_1_odds'] = row.css(".odds-td.odds-0 [payout]::text").extract_first()
            match['player_2_odds'] = row.css(".odds-td.odds-1 [payout]::text").extract_first()
            match['h_2_h'] = row.css("a.h2h::text").extract_first()
            yield match