我在代码中使用了 scrapy-splash 来获取经 JavaScript 渲染后的 HTML。
但是 Splash 的 render.html 端点给我返回了如下错误:
{
"error": 400,
"type": "BadOption",
"description": "Incorrect HTTP API arguments",
"info": {
"type": "argument_required",
"argument": "url",
"description": "Required argument is missing: url"
}
}
因此我无法得到包含 JavaScript 生成的 HTML 的响应。这是我的 spider.py:
class ThespiderSpider(scrapy.Spider):
    """Spider that requests the Repsol page on empresia.es through Splash.

    A plain request to example.com is issued first; its callback then
    schedules one SplashRequest per entry in ``start_urls``.
    """

    name = 'thespider'
    # allowed_domains = ['https://www.empresia.es/empresa/repsol/']
    start_urls = ['https://www.empresia.es/empresa/repsol/']

    def start_requests(self):
        """Yield a single non-Splash request handled by ``fake_start_requests``."""
        yield scrapy.Request('http://example.com', self.fake_start_requests)

    def fake_start_requests(self, response):
        """Yield a SplashRequest to ``render.html`` for every start URL.

        ``response`` (the example.com page) is not used.
        """
        for target in self.start_urls:
            # A fresh args dict is built for every request.
            # NOTE(review): 'url' is not part of args here, and the Splash 400
            # response reports "Required argument is missing: url" -- confirm
            # whether it has to be passed explicitly.
            yield SplashRequest(
                target,
                callback=self.parse,
                endpoint='render.html',
                args={'wait': 1.5, 'http_method': 'POST'},
            )

    def parse(self, response):
        """Open the response in a browser and yield one summary item.

        The item holds the extracted ``<title>`` elements, the Content-Type
        header, the text of anchors directly following ``.fa-user-circle-o``
        elements, and how many such anchors were found.
        """
        open_in_browser(response)
        page_title = response.css("title").extract()
        # HAR capture is disabled: response.data["har"]["log"]["pages"]
        content_type = response.headers.get('Content-Type')
        user_names = response.css('.fa-user-circle-o+ a::text').extract()
        item = {
            'title': page_title,
            'headers': content_type,
            'names': user_names,
            'length': len(user_names),
        }
        yield item
这是我的 settings.py:
# User-Agent string sent with requests (split across lines for readability;
# the pieces concatenate to a single string).
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/79.0.3945.88 Safari/537.36"
)

# Respect robots.txt rules.
ROBOTSTXT_OBEY = True

# --- scrapy-splash integration ---

# Address of the Splash HTTP API.
SPLASH_URL = "http://127.0.0.1:8050/"
# SPLASH_URL = 'http://192.168.59.103:8050/'  (alternate host, disabled)

# Downloader middlewares: the two scrapy_splash middlewares plus Scrapy's
# HttpCompressionMiddleware, each with its order value.
DOWNLOADER_MIDDLEWARES = {
    "scrapy_splash.SplashCookiesMiddleware": 723,
    "scrapy_splash.SplashMiddleware": 725,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}

# Spider middleware supplied by scrapy_splash.
SPIDER_MIDDLEWARES = {
    "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
}

# Duplicate filter and HTTP cache storage classes supplied by scrapy_splash.
DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
HTTPCACHE_STORAGE = "scrapy_splash.SplashAwareFSCacheStorage"
感谢您的帮助。
答案 0 :(得分:0)
在 args 参数中提供 url 参数,这样就可以正常工作了。
yield SplashRequest(self.parse,
args={'url': url, 'wait': 1.5, 'http_method': 'POST'},
endpoint='render.html'
)