我不太明白,我不断得到索引错误,说明超出范围。是因为列表链接是空的吗?这里写链接的格式是什么
class POSpider(CrawlSpider):
name = 'po'
start_urls = ['https://www.poets.org/poetsorg/poems']
allowed_domains = ['poets.org/poetsorg/poems']
def parse(self, response):
items=[]
l=response.xpath('//*[@class="themes"]//a//@href').extract()
theme_ids=[]
for item in l:
theme_ids.append(item[855:1412])
theme_urls=[]
for tid in theme_ids:
theme_urls.append('https://www.poets.org/poetsorg/poems? field_occasion_tid=All&field_poem_themes_tid='+ tid)
for link in theme_urls:
request=scrapy.Request(link,callback=self.parse_layer2,dont_filter=True)
yield request
def parse_layer2(self,response):
items=[]
p=response.xpath('//*[@id="block-views-poems-poems-block-all"]/div/div//tbody//td[2]//@href')[-1].extract()
poem_urls=[]
for item in p:
poem_urls.append(item)
for link in poem_urls:
request=scrapy.Request(link,callback=self.parse_layer3,dont_filter=True)
yield request
def parse_layer3(self,response):
items=[]
poems=response.xpath('//*[@id="poem-content"]/div[2]/div/div').extract()
for poem in poems:
item=PoetryItem()
s=poem.xpath('*/p/text()').extract()
t=strip_list(s)
t=t.encode('ascii','replace').lower()+'\r\n'
item['poem']=t
items.append(item)
return items
这就是我一直得到的结果。
Traceback (most recent call last):
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Users/chinouhane/Desktop/Org/Org/spiders/PO_spider.py", line 37, in parse_layer2
p=response.xpath('//*[@id="block-views-poems-poems-block-all"]/div/div//tbody//td[2]//@href')[-1].extract()
File "//anaconda/lib/python2.7/site-packages/parsel/selector.py", line 56, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range