我正在尝试从网址中的网页中提取章节标题及其字幕。这是我的蜘蛛
import scrapy
from ..items import ContentsPageSFBItem
class BasicSpider(scrapy.Spider):
name = "contentspage_sfb"
#allowed_domains = ["web"]
start_urls = [
'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
]
def parse(self, response):
item = ContentsPageSFBItem()
item['content_item'] = response.xpath('normalize-space(//ol[@class="detail-toc"]//*/text())').extract();
length = len(response.xpath('//ol[@class="detail-toc"]//*/text()').extract()); #extract()
full_url_list = list();
title_list = list();
for i in range(1,length+1):
full_url_list.append(response.url)
item["full_url"] = full_url_list
title = response.xpath('//title[1]/text()').extract();
for j in range(1,length+1):
title_list.append(title)
item["title"] = title_list
return item
即使我在xpath中使用normalize fucntion来删除空格,我在csv中得到以下结果
content_item,full_url,title
"
,Chapter 1,
,
,
,Instructor Introduction,
,00:01:00,
,
,
,Course Overview,
如何在每次进入后最多只获得一个新行?
答案 0 :(得分:1)
如果您想获取Table of Contents
部分中的所有文字,则需要将item['content_item']
中的xpath表达式更改为:
item['content_item'] = response.xpath('//ol[@class="detail-toc"]//a/text()').extract()
你可以像这样重写你的蜘蛛代码:
import scrapy
class BasicSpider(scrapy.Spider):
name = "contentspage_sfb"
start_urls = [
'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/',
]
def parse(self, response):
item = dict() # change dict to your scrapy item
for link in response.xpath('//ol[@class="detail-toc"]//a'):
item['link_text'] = link.xpath('text()').extract_first()
item['link_url'] = response.urljoin(link.xpath('@href').extract_first())
yield item
# Output:
{'link_text': 'About This E-Book', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/pref00.html#pref00'}
{'link_text': 'Title Page', 'link_url': 'https://www.safaribooksonline.com/library/view/shell-programming-in/9780134496696/title.html#title'}