我设法解析了广告列表,在AdvertItem
中添加了一些信息,然后使用AdvertLoader
加载此项。但是我无法弄清楚如何从项目页面详细信息中获取有关每个广告的一些额外信息,将这些附加信息放在同一个AdvertItem
对象中,然后使用AdvertLoader
加载包含所有信息的项目。 / p>
class AdvertLoader(ItemLoader):
default_input_processor = MapCompose(unicode.strip, remove_tags)
default_output_processor = Join()
class AdvertSpider(scrapy.Spider):
name = "adverts"
start_urls = [
"http://blablaadverts.com/",
]
adverts_list_xpath = '//table[@class="object-list-table"]/tbody/tr[@class="object-type-apartment"]'
advert_item_fields = {
'id': './@id',
'link': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/@href',
'status': 'normalize-space(.//td[contains(@class, "object-media")]/div/p/a/span[contains(@class, '
'"sold-overlay-list")]/span/text())',
'state': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/text()',
'city': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/text()',
'zone': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/text()',
'address': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/text()',
'rooms': './/td[contains(@class, "object-rooms")]/text()',
'area': 'normalize-space(.//td[contains(@class, "object-m2")]/text())',
'price': 'normalize-space(.//td[contains(@class, "object-price")]/p/text())',
}
advert_details_xpath = '//table[contains(@class, "object-data-meta")]/tbody/tr'
advert_item_details_fields = {
'floor': './/td/text()',
'built_in_year': './/td/text()',
'condition': './/td/text()',
'ownership': './/td/text()',
'energy_level': './/td/text()',
}
contact_name = '//div[contains(@class, "object-article-contact")]/p[@class="fn"]/text()'
next_page = '//li[contains(@class, "next")]/a/@href'
def parse(self, response):
selector = Selector(response)
for advert in selector.xpath(self.adverts_list_xpath):
loader = AdvertLoader(AdvertItem(), selector=advert)
for field, xpath in self.advert_item_fields.iteritems():
loader.add_xpath(field, xpath)
# This request is not working as I expect.
yield scrapy.Request("http://blablaadverts.com/index.htmlnr=55&search_key=ca41231a29d2ab921aed02e864152c0e",
callback=self.parse_page2, meta={'loader': loader})
yield loader.load_item()
next_page = response.xpath(self.next_page).extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield Request(next_page, callback=self.parse)
def parse_page2(self, response):
selector = Selector(response)
loader = response.meta['loader'] # type: AdvertLoader
loader.selector = selector
loader.add_xpath('contact_name', self.contact_name)
# yield loader.load_item()
以下代码仅保存有关每个广告的信息,而不会从第二个项目详细信息页面中提供额外的详细信息。
如果我与parse_page2()
函数分开运行,则函数parse()
正在运行。
如何收集所有信息,然后才在加载程序中加载我的AdvertItem
对象?
答案 0 :(得分:0)
我不确定我是否正确。
但改变这部分代码
# This request is not working as I expect.
yield scrapy.Request("http://blablaadverts.com/index.htmlnr=55&search_key=ca41231a29d2ab921aed02e864152c0e",
callback=self.parse_page2, meta={'loader': loader})
yield loader.load_item()
到
# This request is not working as I expect.
scrapy.Request("http://blablaadverts.com/index.htmlnr=55&search_key=ca41231a29d2ab921aed02e864152c0e",
callback=self.parse_page2, meta={'loader': loader})
loader.load_item()
当所有信息都可用时,此函数中有yield
。
def parse_page2(self, response):
selector = Selector(response)
loader = response.meta['loader'] # type: AdvertLoader
loader.selector = selector
loader.add_xpath('contact_name', self.contact_name)
yield loader.load_item()