我是Scrapy的新手。我写了下面这个爬虫,但不明白为什么在parse方法中通过callback指定的parse_item始终没有被调用。
欢迎任何帮助。提前谢谢。
class ManualSpider(Spider):
    """Crawl the Gumtree London flats/houses listings and scrape each advert.

    ``parse`` walks the paginated index pages and schedules one request per
    advert link; ``parse_item`` turns each advert page into a ``StackItem``.
    """

    name = "manual"
    # Entries must be bare domain names, not URLs. Scrapy's offsite filter
    # compares them with each request's host name, so a value containing the
    # "https://" scheme never matches and every request yielded from
    # ``parse`` is dropped before its callback (``parse_item``) can run.
    allowed_domains = ["www.gumtree.com"]
    start_urls = ['https://www.gumtree.com/flats-houses/london']

    def parse_item(self, response):
        """Build one ``StackItem`` from an advert page.

        Args:
            response: The downloaded advert page.

        Yields:
            The item produced by the loader after all field processors ran.
        """
        # Create the loader using the response
        loader = ItemLoader(item=StackItem(), response=response)

        # Primary fields, extracted from the page markup.
        loader.add_xpath(
            'title',
            '//main/div[2]/header/h1/text()',
            MapCompose(unicode.strip, unicode.title),
        )
        loader.add_xpath(
            'price',
            '//header/span/strong/text()',
            # Drop thousands separators so the matched text parses as float.
            MapCompose(lambda i: i.replace(',', ''), float),
            re='[,.0-9]+',
        )
        loader.add_xpath(
            'description',
            '//p[@itemprop="description"][1]/text()',
            Join(),
            MapCompose(unicode.strip),
        )
        loader.add_xpath(
            'address',
            '//*[@itemtype="http://schema.org/Place"][1]/text()',
            MapCompose(unicode.strip),
        )
        loader.add_xpath(
            'location',
            '//header/strong/span/text()',
            MapCompose(unicode.strip),
        )
        loader.add_xpath(
            'image_urls',
            '//*[@itemprop="image"][1]/@src',
            # Image sources may be relative; resolve them against the page URL.
            MapCompose(lambda i: urljoin(response.url, i)),
        )

        # Housekeeping fields describing where and when the item was scraped.
        loader.add_value('url', response.url)
        loader.add_value('project', "example")
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())

        yield loader.load_item()

    def parse(self, response):
        """Follow pagination links and schedule a request for every advert.

        Args:
            response: A downloaded listing (index) page.

        Yields:
            Requests for the next index page(s), handled by this method
            again, and requests for advert pages, handled by ``parse_item``.
        """
        # Get the next index URLs and yield Requests. No callback is given,
        # so Scrapy routes these responses back to ``parse``.
        next_selector = response.xpath('//*[@class="pagination-next"]//@href')
        for url in next_selector.extract():
            yield Request(urljoin(response.url, url))

        # Get item URLs and yield Requests
        item_selector = response.xpath('//div[@id="srp-results"]//article//@href')
        for url in item_selector.extract():
            if not url:
                # Skip empty href attributes.
                continue
            item_url = urljoin(response.url, url)
            print(item_url)
            yield Request(item_url, callback=self.parse_item)
答案 0 :(得分:1)
它不起作用,因为您以字符串的形式提供了回调:callback="parse_item"。
您应该传入方法对象本身,例如:callback=self.parse_item。
另请在allowed_domains中只填写域名(例如"www.gumtree.com"),不要包含"https://"这样的协议前缀,否则后续请求会被当作站外请求过滤掉。
答案 1 :(得分:0)
将callback="parse_item"更改为callback=self.parse_item。