我正在尝试构建Scrapy Spider,以解析艺术家并跟踪SoundCloud中的信息。
使用FireFox中的开发人员工具,我确定可以进行API调用,该API返回返回转换为python字典的JSON对象。此API调用需要一个演出者ID,据我所知,这些ID已自动递增。这意味着我无需爬网该站点,只需列出启动URL即可进行初始API调用,然后解析随后的页面。我相信这应该使我对网站更加友好?
从返回的响应中可以获得艺术家的URL,访问并解析该URL将提供有关艺术家的更多信息
从艺术家的URL中,我们可以访问他们的曲目,并在曲目的属性旁边抓取曲目列表。
我认为我遇到的问题源于对Scrapy框架的不了解... 如果我直接在歌手的URL中输入start_urls,则Scrapy将scrapy.http.response.html.HtmlResponse对象传递给parse_artist。这使我可以提取所需的数据(我没有包括分析页面的所有代码以使代码段更短)。但是,如果我从parse_api_call函数将同一对象传递给同一函数,则会导致错误...
我不明白为什么会这样,任何帮助将不胜感激。
侧面说明: 最初的API调用从艺术家那里获取曲目,并且可以更改偏移量和限制,并以递归方式调用该函数以收集曲目。但是,这已被证明是不可靠的,即使它没有导致终止程序的错误,也无法从艺术家那里获得曲目的完整列表。
这是当前代码:
"""
Scrapes SoundCloud websites for artists and tracks
"""
import json
import scrapy
from ..items import TrackItem, ArtistItem
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
class SoundCloudBot(scrapy.Spider):
name = 'soundcloudBot'
allowed_domains = ['soundcloud.com']
start_urls = [
'https://api-v2.soundcloud.com/users/7436630/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/4803918/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/17364233/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/19697240/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/5949564/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en'
]
# This is added for testing purposes. When these links are added directly to the
# start_urls the code runs as expected, when these links are extracted using parse_api_call
# is when problems arise
# start_urls.extend([
# 'https://soundcloud.com/futureisnow',
# 'https://soundcloud.com/bigsean-1',
# 'https://soundcloud.com/defjam',
# 'https://soundcloud.com/ymcmbofficial',
# 'https://soundcloud.com/walefolarin',
# # 'https://soundcloud.com/futureisnow/tracks',
# # 'https://soundcloud.com/bigsean-1/tracks',
# # 'https://soundcloud.com/defjam/tracks',
# # 'https://soundcloud.com/ymcmbofficial/tracks',
# # 'https://soundcloud.com/walefolarin/tracks'
# ])
def parse(self, response):
url = response.url
if url[:35] == 'https://api-v2.soundcloud.com/users':
self.parse_api_call(response)
# 'https://soundcloud.com/{artist}'
elif url.replace('https://soundcloud.com', '').count('/') == 1: # One starting forward slash for artist folder
self.parse_artist(response)
# 'https://soundcloud.com/{artist}/{track}'
elif url.replace('https://soundcloud.com', '').count('/') == 2 and url[-6:] == 'tracks':
self.parse_tracks(response)
def parse_api_call(self, response):
data = json.loads(response.text)
artistItem = ArtistItem()
first_track = data['collection'][0]
artist_info = first_track.get('user')
artist_id = artist_info.get('id')
artist_url = artist_info.get('permalink_url')
artist_name = artist_info.get('username')
artistItem['artist_id'] = artist_id
artistItem['username'] = artist_name
artistItem['url'] = artist_url
artist_response = scrapy.http.response.html.HtmlResponse(artist_url)
self.parse_artist(artist_response)
# Once the pipelines are written this will be changed to yeild
return artistItem
def parse_artist(self, response):
# This prints out <class 'scrapy.http.response.html.HtmlResponse'>
# It doesn't matter if start_urls get extend with artists' URLS or not
print(type(response))
data = response.css('script::text').extract()
# This prints out a full HTML response if the function is called directly
# With scrapy, or an empty list if called from parse_api_call
print(data)
track_response = scrapy.http.response.html.HtmlResponse(f'{response.url}/tracks')
self.parse_tracks(track_response)
def parse_tracks(self, response):
pass
答案 0 :(得分:2)
您必须使用
Request(url)
从新网址获取数据。但是您不能将其作为常规函数执行并立即获得结果。您必须使用return Request()
或yield Request()
并将scrapy放入队列中以便稍后获取数据。
获取数据后,它使用方法parse()
来解析响应中的数据。但是您可以在请求中设置自己的方法
Request(url, self.parse_artist)
但是在parse_artist()
中,您将无法访问在上一个函数中获得的数据,因此您必须使用meta
在请求中发送数据,即。
Request(artistItem['url'], self.parse_artist, meta={'item': artistItem})
完整的工作代码。您可以将所有文件放在一个文件中并运行它,而无需创建项目。
它还将结果保存到output.csv
import scrapy
from scrapy.http import Request
import json
class MySpider(scrapy.Spider):
name = 'myspider'
allowed_domains = ['soundcloud.com']
start_urls = [
'https://api-v2.soundcloud.com/users/7436630/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/4803918/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/17364233/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/19697240/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en',
'https://api-v2.soundcloud.com/users/5949564/tracks?offset=0&limit=20&client_id=Q11Oe0rIPEuxvMeMbdXV7qaowYzlaESv&app_version=1556892058&app_locale=en'
]
def parse(self, response):
data = json.loads(response.text)
if len(data['collection']) > 0:
artist_info = data['collection'][0]['user']
artistItem = {
'artist_id': artist_info.get('id'),
'username': artist_info.get('username'),
'url': artist_info.get('permalink_url'),
}
print('>>>', artistItem['url'])
# make requests to url artistItem['url'],
# parse response in parse_artist,
# send artistItem to parse_artist
return Request(artistItem['url'], self.parse_artist, meta={'item': artistItem})
else:
print("ERROR: no collections in data")
def parse_artist(self, response):
artistItem = response.meta['item']
data = response.css('script::text').extract()
# add data to artistItem
#print(data)
artistItem['new data'] = 'some new data'
#print('>>>', response.urljoin('tracks'))
print('>>>', response.url + '/tracks')
# make requests to url artistItem['url'],
# parse response in parse_tracks,
# send artistItem to parse_tracks
return Request(response.url + '/tracks', self.parse_tracks, meta={'item': artistItem})
def parse_tracks(self, response):
artistItem = response.meta['item']
artistItem['tracks'] = 'some tracks'
# send to CSV file
return artistItem
#------------------------------------------------------------------------------
# run it without creating project
#------------------------------------------------------------------------------
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
# save in file as CSV, JSON or XML
'FEED_FORMAT': 'csv', # csv, json, xml
'FEED_URI': 'output.csv', #
})
c.crawl(MySpider)
c.start()
ouput.csv
artist_id,username,url,new data,tracks
17364233,Def Jam Recordings,https://soundcloud.com/defjam,some new data,some tracks
4803918,Big Sean,https://soundcloud.com/bigsean-1,some new data,some tracks
19697240,YMCMB-Official,https://soundcloud.com/ymcmbofficial,some new data,some tracks
5949564,WALE,https://soundcloud.com/walefolarin,some new data,some tracks