我正在使用Scrapy在python中创建一个Web搜寻器,并且我开发了以下代码:
import scrapy
from scrapy.selector import Selector
class SapoSpider(scrapy.Spider):
name = "imo"
allowed_domains = ["imovirtual.com"]
start_urls = ["https://www.imovirtual.com/arrendar/apartamento/lisboa/"]
custom_setting = {
'FEED_URI': './output.json'
}
def parse(self,response):
subpage_links = []
for i in response.css('div.offer-item-details'):
youritem = {
'Titulo':i.css('span.offer-item-title::text').extract(),
#'Titulo':i.css
'Preço':i.css('li.offer-item-price::text').re('[^\t\n]+'),
}
subpage_link = i.css('header[class=offer-item-header]
a::attr(href)').extract()
subpage_links.extend(subpage_link)
for subpage_link in subpage_links:
yield scrapy.Request(subpage_link, callback=self.parse_subpage, meta={'item':youritem})
def parse_subpage(self,response):
#for j in response.css('div.article-offer'):
#respo = Selector(response) # a HTML selector
youritem = response.meta.get('item')
youritem['Tipologia'] = response.xpath('//ul[@class="main-list"]//li[3]//span').extract()
youritem['Condição'] = response.xpath('//ul[@class="sub-list"]/li[3]/text()').extract()
youritem['Caracteristicas'] = response.css('ul.dotted-list li::text').extract()
yield youritem
当我打开json文件时,我发现与变量“ Tipologia”有关的数据采用以下格式:“ Tipologia”:[span strong T2 / strong / span]。我如何获得这些子弹(跨度高)?有什么建议吗?