import scrapy
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
class NextlinkSpider(scrapy.Spider):
    """Spider that starts at the site root and requests the recipes page."""

    name = 'nextlink'
    # Entries must be bare domain names. A trailing slash (or any path)
    # never matches a request's hostname, so follow-up requests would be
    # dropped as offsite.
    allowed_domains = ['www.shanazrafiq.com']
    start_urls = ['https://www.shanazrafiq.com/']

    def parse(self, response):
        """Schedule the recipes page and route it to ``parse_dir_contents``.

        The callback is looked up as ``self.parse_dir_contents``, so it has
        to be defined as a method of this class; otherwise this line raises
        ``AttributeError`` when ``parse`` runs.
        """
        yield Request(
            url='https://www.shanazrafiq.com/p/recipes.html',
            callback=self.parse_dir_contents,
        )
错误:AttributeError: 'NextlinkSpider' 对象没有属性 'parse_dir_contents'
我正在尝试从起始网址跳转到另一个网址继续抓取。
我希望从首页导航到食谱页面,并获取该页面中的标签内容,但回调函数无法被调用,运行时会报出上面的"找不到属性"错误。请帮我解决这个问题,提前致谢。
def parse_dir_contents(self, response):
    """Print the heading texts selected from the recipes page response.

    Intended as the second callback, invoked with the response of the
    request scheduled by ``parse``.

    Args:
        response: The downloaded page; must provide a ``css`` method.
    """
    # NOTE(review): "div.widget HTML h2" is a descendant selector that looks
    # for an <HTML> element inside div.widget. If the target is an element
    # carrying both classes (class="widget HTML"), the selector should be
    # "div.widget.HTML h2::text" -- confirm against the page markup.
    title_name = response.css("div.widget HTML h2::text").extract()
    # Function-call form works on both Python 2 and Python 3.
    print(title_name)
答案 0 :(得分:1)
根据您提供的错误消息推测,您的 NextlinkSpider 类中没有 parse_dir_contents 方法。我猜您是在类之外(或其他地方)定义了一个同名的函数。
我建议把这两部分"粘"在一起,也就是把该函数缩进到类的内部:
class NextlinkSpider(scrapy.Spider):
    """Spider that opens the site root, then scrapes the recipes page."""

    name = 'nextlink'
    # Bare domain name only: an entry with a trailing slash never matches a
    # request's hostname, so the recipes request would be filtered as offsite.
    allowed_domains = ['www.shanazrafiq.com']
    start_urls = ['https://www.shanazrafiq.com/']

    def parse(self, response):
        """Schedule the recipes page and route it to ``parse_dir_contents``."""
        yield Request(
            url='https://www.shanazrafiq.com/p/recipes.html',
            callback=self.parse_dir_contents,
        )

    # Indented inside the class so that ``self.parse_dir_contents`` resolves.
    def parse_dir_contents(self, response):
        """Print the heading texts selected from the recipes page response."""
        # NOTE(review): "div.widget HTML h2" looks for an <HTML> element
        # nested in div.widget. If the target has class="widget HTML", the
        # selector should be "div.widget.HTML h2::text" -- confirm against
        # the page markup.
        title_name = response.css("div.widget HTML h2::text").extract()
        # Function-call form works on both Python 2 and Python 3.
        print(title_name)
关键在于缩进(其余逻辑不变):只有把 parse_dir_contents 缩进到类体内,它才会成为 NextlinkSpider 的方法,self.parse_dir_contents 才能被找到。
答案 1 :(得分:0)
我做了一些修改,下面的代码应该能让您走上正轨。它使用 CrawlSpider(scrapy.spiders.CrawlSpider),并跟进 start_urls 页面上的所有食谱链接,在每个单独的食谱页面上提取标题、网址和图片网址。希望对您有所帮助:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
class NextlinkItem(scrapy.Item):
    """Container for the data collected from one recipe page."""

    # Title text extracted from the page.
    title_name = scrapy.Field()
    # URL of the page the item was scraped from.
    url = scrapy.Field()
    # List of image URLs found in the page metadata.
    image_urls = scrapy.Field()
class NextlinkSpider(CrawlSpider):
    """Crawl the recipes index and emit one ``NextlinkItem`` per linked page."""

    name = 'nextlink'
    allowed_domains = ['shanazrafiq.com']
    start_urls = ['https://www.shanazrafiq.com/p/recipes.html']

    # XPath for selecting links to follow
    xp = '//div[contains(@class, "post-body")]/div/div/div/h4/ul/li/a'

    rules = (
        # Only links inside ``xp`` are extracted; each fetched page goes to
        # ``parse_item`` and is itself scanned for further matching links.
        Rule(LinkExtractor(restrict_xpaths=xp), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build an item with the title, URL and image URLs of ``response``.

        Yields:
            NextlinkItem: ``title_name`` is the stripped text of the first
            matching heading (empty string when none matches), ``url`` is
            the response URL and ``image_urls`` is a list, possibly empty.
        """
        item = NextlinkItem()
        # ``extract_first()`` returns None when nothing matches, which would
        # make ``.strip()`` raise AttributeError; fall back to ''.
        title = response.xpath(
            '//div[contains(@class, "post-outer")]/div/h3/text()'
        ).extract_first(default='')
        item['title_name'] = title.strip()
        item['url'] = response.url
        item['image_urls'] = response.xpath(
            '//div[contains(@class, "post-outer")]/div/meta[@itemprop="image_url"]/@content'
        ).extract()
        yield item