假设我想写一个爬虫(spider),使用 Facebook API 来统计网站每个页面的点赞数(likes)。如果我导入 requests 库,就可以按如下方式调用 Facebook Graph API。
import scrapy
import json
import requests
API_KEY="KEY_GOES_HERE"
class WebSite(scrapy.Spider):
    """Crawl website.com and attach Facebook share statistics to every page.

    The Graph API lookup is made with the ``requests`` library directly from
    inside the ``parse`` callback, i.e. as a synchronous call that bypasses
    Scrapy's own downloader.
    """

    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        """Query the Graph API for *url* and return its parsed statistics.

        Returns the ``(id, comment_count, share_count)`` tuple produced by
        ``parse_likes``.
        """
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        data = requests.get(base)
        return self.parse_likes(data)

    def parse_likes(self, data):
        """Decode a Graph API response into ``(id, comment_count, share_count)``.

        ``data`` is any response object exposing the JSON payload as ``.text``.
        Raises ``KeyError`` when the payload has no ``id`` or ``share`` entry.
        """
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        """Yield a follow-up request per link, then the item for this page."""
        item = {}
        item['url'] = response.url
        links = response.css('a::attr(href)').extract()
        # get_likes returns (id, comment_count, share_count); unpack in that
        # order so each count is stored under the key that names it.
        fb_url, comments, shares = self.get_likes(response.url)
        item['fb_url'] = fb_url
        item['shares'] = shares
        item['comments'] = comments
        for link in links:
            link = response.urljoin(link)
            # The same dict is reused, so once the loop finishes 'link' holds
            # the last outgoing link of the page.
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
但是,如果我改用 scrapy.Request 而不是 requests 库来发起调用,这段代码似乎就无法正常工作了。例如下面这样:
import scrapy
import json
import requests
API_KEY="KEY_GOES_HERE"
class WebSite(scrapy.Spider):
    """Crawl website.com, attempting the Graph API lookup via ``scrapy.Request``.

    NOTE: ``get_likes`` only builds a Request object. Nothing in this class
    yields that object, so the ``parse_likes`` callback is not triggered from
    here, and the ``.body`` read in ``parse`` is the body of the outgoing
    request rather than a response from Facebook.
    """

    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        """Build (but do not schedule) a Graph API request for *url*."""
        endpoint = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        return scrapy.Request(endpoint, callback=self.parse_likes)

    def parse_likes(self, data):
        """Return ``(id, comment_count, share_count)`` decoded from ``data.text``."""
        payload = json.loads(data.text)
        fb_id = payload['id']
        share = payload['share']
        return fb_id, share['comment_count'], share['share_count']

    def parse(self, response):
        """Yield a follow-up request per link, then the item for this page."""
        hrefs = response.css('a::attr(href)').extract()
        item = {
            'url': response.url,
            # Body of the freshly built Request object, not of any response.
            'fb_data': self.get_likes(response.url).body,
        }
        for href in hrefs:
            target = response.urljoin(href)
            item['link'] = target
            yield scrapy.Request(target, callback=self.parse)
        yield item
在这种情况下,我得到的 Facebook 数据只是一个空白响应。我想我对 scrapy.Request 相对于标准 requests 库的工作方式缺乏了解。有什么想法吗?
答案 0 :(得分:3)
这是一个非常常见的情况:如何从多个网址中提取数据并组成同一个 item?
最常见的解决方案是把您的 item 放在 request.meta 参数中随请求一起传递,从而把多个请求链接起来。
使用此逻辑的示例实现可能如下所示:
class WebSite(scrapy.Spider):
    """Spider that chains one Graph API request per link found on a page.

    The partially built item travels with the API request in ``meta`` and is
    completed and emitted by ``parse_likes`` once the API response arrives.
    """

    name = "website_page"
    # No allowed_domains here: if one is set it must also cover
    # graph.facebook.com, or the API requests are dropped as off-site.
    start_urls = ['https://website.com/']
    # Bound ``str.format`` of the endpoint template; call as base(url, api_key).
    base = 'https://graph.facebook.com/{}?access_token={}'.format
    api_key = '1234'

    def parse(self, response):
        """Yield a Graph API request for every link on the page.

        Each request carries its own item dict under ``meta['item']`` with
        the page URL (``url``) and the absolute link URL (``link``).
        """
        for href in response.css('a::attr(href)').extract():
            link = response.urljoin(href)
            # A fresh dict per link, so items are not shared between requests.
            item = {'url': response.url, 'link': link}
            # Template order is (url, access_token).
            api_url = self.base(link, self.api_key)
            yield scrapy.Request(api_url,
                                 callback=self.parse_likes,
                                 meta={'item': item})

    def parse_likes(self, response):
        """Complete the item carried in ``meta`` with the Graph API counts.

        Stores ``(id, comment_count, share_count)`` under ``share_count`` and
        yields the item. Raises ``KeyError`` when the payload has no ``id``
        or ``share`` entry.
        """
        item = response.meta['item']
        data = json.loads(response.text)
        share = data['share']
        item['share_count'] = (data['id'], share['comment_count'], share['share_count'])
        yield item