After finishing my spider, I can't download any data from the API, and nothing shows up in the error file. Can anyone give me a hint?
import json
import scrapy

class SpidyQuotesSpider(scrapy.Spider):
    name = 'spidyquotes'
    quotes_base_url = 'https://www.olx.co.id/api/relevance/search?category=198&facet_limit=100&location=1000001&location_facet_limit=20&page=%s'
    start_urls = [quotes_base_url % 1]
    download_delay = 1.5

    def parse(self, response):
        data = json.loads(response.body)
        for item in data.get('data', []):
            yield {
                'car_id': item.get('id'),
                'car_name': item.get('title'),
                'price': item.get('price.value.currency.display'),
                'user_id': item.get('user_id')
                # 'user_name':
            }
        if data['has_next']:
            next_page = data['page'] + 1
            yield scrapy.Request(self.quotes_base_url % next_page)
Answer 0 (score: 1)
There is no data['has_next'] in this API's response; there is only data['metadata']['next_page_url'], so you can use:
url = data['metadata']['next_page_url']
if url:
    yield scrapy.Request(url)
Or, to make it safer:
metadata = data.get('metadata')
if metadata:
    url = metadata.get('next_page_url')
    if url:
        yield scrapy.Request(url)
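The same guard can also be collapsed into a single chained lookup; a minimal sketch (the or {} fallback is my addition, covering the case where the metadata key exists but holds null in the JSON):

url = (data.get('metadata') or {}).get('next_page_url')
if url:
    yield scrapy.Request(url)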
Or you can use try/except:
try:
    yield scrapy.Request(data['metadata']['next_page_url'])
except Exception as ex:
    print("Ex:", ex)
Full code that runs without creating a project:
import json
import scrapy

class MySpider(scrapy.Spider):
    name = 'spidyquotes'
    quotes_base_url = 'https://www.olx.co.id/api/relevance/search?category=198&facet_limit=100&location=1000001&location_facet_limit=20&page=%s'
    start_urls = [quotes_base_url % 1]
    download_delay = 1.5

    def parse(self, response):
        data = json.loads(response.body)
        for item in data.get('data', []):
            yield {
                'car_id': item.get('id'),
                'car_name': item.get('title'),
                # NOTE: dict.get() does not resolve dotted key paths, so the
                # original item.get('price.value.currency.display') always
                # returns None. The chained lookups below follow the same path
                # through nested dicts (an assumption about the JSON shape).
                'price': (((item.get('price') or {}).get('value') or {})
                          .get('currency') or {}).get('display'),
                'user_id': item.get('user_id')
                # 'user_name':
            }
        metadata = data.get('metadata')
        if metadata:
            url = metadata.get('next_page_url')
            if url:
                yield scrapy.Request(url)

# --- it runs without a project and saves results in `output.csv` ---
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save to file as CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
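Assuming the script is saved as, say, spidyquotes.py (the filename is arbitrary), it can be started with a plain Python interpreter, and the scraped rows land in output.csv:

python spidyquotes.py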