我是个新手,已经快被逼疯了:过去一周多的时间里,我到处寻找解决这个问题的方法。我正在尝试从 http://ufcstats.com/event-details/6420efac0578988b 抓取 UFC 1 的表格数据。
我的蜘蛛(spider)工作正常,它以字符串列表的形式返回每个字段。例如:'winner': ['Royce Gracie', 'Jason DeLucia', 'Royce Gracie', 'Gerard Gordeau', 'Ken Shamrock', 'Royce Gracie', 'Kevin Rosier', 'Gerard Gordeau']。当我输出到 csv 时,赛事的获胜者/失败者/其他统计信息只会作为字符串列表输出在同一行里。我想让列表中的每个元素各占一行。我已经可以用 pandas 解决此问题,但觉得那是不必要的额外工作,并且我怀疑它能否很好地扩展。
希望能够像下面表格所示那样输出到 csv。我不确定这应该在 Spider 本身、Item / ItemLoader 还是 pipeline(管道)中完成。
似乎是一个常见问题,但还没能找到解决办法
我尝试过在蜘蛛代码中用 for 循环进行迭代,也尝试过在标准 ItemLoader 的输入处理器和/或输出处理器中处理,以及我在 SO 的各种示例中能找到的所有其他办法,但都无法得到所需的输出。之前遇到的其他问题我都能自己解决,这次却卡住了,任何帮助都将不胜感激。
#items.py
import scrapy
from scrapy.loader.processors import Identity, TakeFirst, Compose,
MapCompose, Join
def compact(s):
    """Return *s* unchanged when it is truthy, otherwise ``None``.

    Used as the last step of a ``MapCompose`` chain after ``str.strip``:
    a fragment that is empty once stripped becomes ``None``, which
    ``MapCompose`` drops (see the Scrapy item-loader documentation).
    """
    if not s:
        return None
    return s
class StatsItem(scrapy.Item):
    """Item holding the values scraped from one UFC event page.

    Every field strips surrounding whitespace from each incoming string.
    The event-level fields and ``f_info`` additionally pass each value
    through ``compact`` so that fragments which are empty after stripping
    are turned into ``None``.
    """

    # Event metadata: stripped, then compacted.
    event_name = scrapy.Field(
        input_processor=MapCompose(str.strip, compact),
    )
    event_date = scrapy.Field(
        input_processor=MapCompose(str.strip, compact),
    )
    event_loc = scrapy.Field(
        input_processor=MapCompose(str.strip, compact),
    )
    attendance = scrapy.Field(
        input_processor=MapCompose(str.strip, compact),
    )

    # Per-fight statistics text: stripped, then compacted.
    f_info = scrapy.Field(
        input_processor=MapCompose(str.strip, compact),
    )

    # Fighter names: stripped only.
    winner = scrapy.Field(
        input_processor=MapCompose(str.strip),
    )
    loser = scrapy.Field(
        input_processor=MapCompose(str.strip),
    )
#spider code
import scrapy
from ..items import StatsItem
from scrapy.loader import ItemLoader
#from scrapy.loader.processors import Join, MapCompose, TakeFirst
class StatsSpider(scrapy.Spider):
    """Spider from the question: loads one ``StatsItem`` per page container.

    Each ``ItemLoader`` is bound to the whole response, so every field of
    the yielded item collects all matches found on the page. This is the
    behaviour the question describes (one CSV row holding lists of values).
    """

    name = 'stats'
    allowed_domains = ['fcstats...']
    start_urls = ['http://fcstats.../']

    custom_settings = {
        # Fields written by the feed exporter, in this order. The per-fighter
        # statistic columns (w_str, w_td, ..., l_round, l_time) are not
        # exported here.
        'FEED_EXPORT_FIELDS': [
            'event_name',
            'event_date',
            'event_loc',
            'attendance',
            'winner',
            'loser',
            'f_info',
        ],
    }

    def parse(self, response):
        """Request the first event link found in the reversed listing rows."""
        reversed_rows = response.css('tr.b-statistics__table-row')[::-1]
        # Only the first link is followed; looping over every extracted href
        # would crawl all events instead.
        first_link = reversed_rows.css('i>a::attr(href)').extract_first()
        yield scrapy.Request(url=first_link, callback=self.parse_event)

    def parse_event(self, response):
        """Yield one item per ``div.l-page__container`` on the event page."""
        # (item field, CSS query) pairs, applied in this order.
        field_queries = (
            ('event_name', 'h2.b-content__title>span::text'),
            ('event_date', 'ul.b-list__box-list>li:nth-child(1)::text'),
            ('event_loc', 'ul.b-list__box-list>li:nth-child(2)::text'),
            ('attendance', 'ul.b-list__box-list>li:nth-child(3)::text'),
            ('winner', 'p.b-fight-details__table-text:nth-child(odd)>a::text'),
            ('loser', 'p.b-fight-details__table-text:nth-child(even)>a::text'),
            ('f_info', 'td p.b-fight-details__table-text::text'),
        )
        containers = response.css('div.l-page__container')
        for _ in containers:
            # The loader queries the full response, not the current container.
            il = ItemLoader(StatsItem(), response=response)
            for field, query in field_queries:
                il.add_css(field, query)
            yield il.load_item()
实际结果:
event_name event_date event_loc attendance winner loser f_info
UFC 1: The Beginning 12-Nov-93 Denver, Colorado, USA 2,800 Royce Gracie,Jason DeLucia,Royce Gracie,Gerard Gordeau,Ken Shamrock,Royce Gracie,Kevin Rosier,Gerard Gordeau Gerard Gordeau,Trent Jenkins,Ken Shamrock,Kevin Rosier,Patrick Smith,Art Jimmerson,Zane Frazier,Teila Tuli 1,0,1,0,1,0,2,0,Open Weight,SUB,Rear Naked Choke,1,1:44,3,1,1,0,1,0,1,0,Open Weight,SUB,Rear Naked Choke,1,0:52,0,0,0,0,1,0,2,0,Open Weight,SUB,Rear Naked Choke,1,0:57,11,0,0,0,0,0,0,0,Open Weight,KO/TKO,1,0:59,1,4,1,0,2,0,0,0,Open Weight,SUB,Heel Hook,1,1:49,0,0,1,0,0,0,2,0,Open Weight,SUB,Other,1,2:18,15,12,0,0,0,0,0,0,Open Weight,KO/TKO,1,4:20,3,0,0,0,0,0,0,0,Open Weight,KO/TKO,Kick,1,0:26
预期结果将更像:
event_name event_date event_loc attendance winner loser f_info
UFC 1: The Beginning 12-Nov-93 Denver, Colorado, USA 2,800 Royce Gracie, Gerard Gordeau, 1,0,1,0,1,0,2,0,Open Weight,SUB,Rear Naked Choke,1,1:44,
UFC 1: The Beginning 12-Nov-93 Denver, Colorado, USA 2,800 Jason DeLucia, Trent Jenkins 3,1,1,0,1,0,1,0,Open Weight,SUB,Rear Naked Choke,1,0:52 ....
*为清楚起见而编辑
答案 0 :(得分:1)
我已经在Scrapy工作了很多年,我发现这个Item类毫无用处并且非常令人困惑,特别是对于Scrapy的新手来说
在您的情况下,您需要在 for 循环中迭代赢家和输家元素,然后逐个 yield(产出)每一项
class StatsSpider(scrapy.Spider):
    """Scrape one UFC event page and emit one record per fighter.

    ``parse`` follows a single event link from the completed-events listing;
    ``parse_event`` yields a plain ``dict`` for every winner and then every
    loser on the event page, so a CSV export gets one row per fighter
    instead of one row holding whole lists.
    """

    name = 'stats'
    allowed_domains = ['ufcstats.com']
    start_urls = ['http://ufcstats.com/statistics/events/completed?page=all']

    @staticmethod
    def _first_text(selector, query):
        """Return the first non-blank text matched by *query*, stripped.

        Returns ``None`` when nothing (or only whitespace) matches. This
        mirrors the strip-then-drop-empties cleaning that the item input
        processors in the question apply.
        """
        for text in selector.css(query).extract():
            text = text.strip()
            if text:
                return text
        return None

    def parse(self, response):
        """Request the first event link found in the reversed listing rows."""
        rev_orderd_events = response.css('tr.b-statistics__table-row')[::-1]
        event_link = rev_orderd_events.css('i>a::attr(href)').extract_first()
        # extract_first() gives None when no link matches; do not build a
        # Request from a missing URL.
        if event_link:
            yield scrapy.Request(url=event_link, callback=self.parse_event)

    def parse_event(self, response):
        """Yield one dict per fighter with ``name``, ``type`` and event data.

        ``type`` is ``'winner'`` for links inside odd-positioned
        ``p.b-fight-details__table-text`` elements and ``'loser'`` for the
        even-positioned ones. All winners are yielded before all losers.
        """
        # (record type, nth-child position of the enclosing <p>)
        roles = (('winner', 'odd'), ('loser', 'even'))

        for match in response.css('div.l-page__container'):
            # Event-level values are read from the container itself.
            event_name = self._first_text(
                match, "h2.b-content__title>span::text")
            event_date = self._first_text(
                match, "ul.b-list__box-list>li:nth-child(1)::text")
            event_loc = self._first_text(
                match, "ul.b-list__box-list>li:nth-child(2)::text")

            for role, position in roles:
                links = match.css(
                    "p.b-fight-details__table-text:nth-child(%s)>a" % position)
                for link in links:
                    # A fresh dict per fighter, tagged with its own role.
                    yield {
                        'name': self._first_text(link, "::text"),
                        'type': role,
                        'event_name': event_name,
                        'event_date': event_date,
                        'event_loc': event_loc,
                    }
答案 1 :(得分:0)
感谢@umair和@Catalina_Chircu
def parse_event(self, response):
    """Yield one ``StatsItem`` per fight row of an event page.

    The event-level values are extracted once per ``div.b-fight-details``
    block and copied into every item. Each ``<tr>`` after the first is
    loaded through its own ``ItemLoader`` bound to that row, so the
    winner/loser names and the per-fighter statistics of different fights
    end up in separate items (and therefore separate CSV rows).

    NOTE(review): the ``w_*`` / ``l_*`` fields must be declared on
    ``StatsItem``; confirm the item class defines them.
    """
    # (field suffix, table column, winner <p> position, loser <p> position).
    # Columns 3-6 hold one <p> per fighter; for the remaining columns the
    # winner and the loser deliberately read the same <p>.
    stat_columns = (
        ('str', 3, 'odd', 'even'),
        ('td', 4, 'odd', 'even'),
        ('sub', 5, 'odd', 'even'),
        ('pass', 6, 'odd', 'even'),
        ('wclass', 7, '1', '1'),
        ('method', 8, 'odd', 'odd'),
        ('mthdtl', 8, 'even', 'even'),
        ('round', 9, 'odd', 'odd'),
        ('time', 10, 'odd', 'odd'),
    )
    stat_css = ('td.b-fight-details__table-col:nth-child({col})'
                '>p:nth-child({pos})::text')
    name_css = ('td.b-fight-details__table-col:nth-child(2) '
                'p.b-fight-details__table-text:nth-child({pos})>a::text')

    pg = response.css('div.l-page__container')
    for event in response.css('div.b-fight-details'):
        event_name = pg.css('h2.b-content__title>span::text').extract_first()
        event_date = event.css('ul.b-list__box-list>li:nth-child(1)::text').extract()
        event_loc = event.css('ul.b-list__box-list>li:nth-child(2)::text').extract()
        attendance = event.css('ul.b-list__box-list>li:nth-child(3)::text').extract()

        # The first <tr> is skipped (presumably the table header row).
        for fights in event.css('tr')[1:]:
            il = ItemLoader(StatsItem(), selector=fights)
            il.add_value('event_name', event_name)
            il.add_value('event_date', event_date)
            il.add_value('event_loc', event_loc)
            il.add_value('attendance', attendance)
            il.add_css('winner', name_css.format(pos='odd'))
            il.add_css('loser', name_css.format(pos='even'))
            for suffix, col, w_pos, l_pos in stat_columns:
                il.add_css('w_' + suffix, stat_css.format(col=col, pos=w_pos))
                il.add_css('l_' + suffix, stat_css.format(col=col, pos=l_pos))
            yield il.load_item()
配合 Item 的输入/输出处理器,这段代码基本上得到了我想要的结果。