I am using Scrapy, Splash, and scrapy_splash to crawl a catalog website.
The site opens each item detail page via a form POST.
Sometimes the item detail page comes back in Splash as a default error page (regardless of the HTTP status), but if I re-post the form the item details are returned. I am still investigating the root cause of this response; it looks more like a timing issue than a check that kicks in after n requests.
As a workaround, I use the splash:on_response callback to retry the form POST whenever the error page comes back.
I would like to record the failed attempts so they can be handled manually later. Is there a best practice or standard way to collect this information?
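For reference, the Lua script below is invoked from the spider roughly like this (the spider class, detail-page URL, form fields, and header values are simplified placeholders; LUA_SCRIPT holds the script's source, and the parse_detail callback is sketched after the script):

import scrapy
from scrapy_splash import SplashRequest

LUA_SCRIPT = """ ...the Lua script below... """

class CatalogSpider(scrapy.Spider):
    name = 'catalog'

    def request_item_detail(self, detail_url, formdata):
        # endpoint='execute' runs LUA_SCRIPT; splash.args.url is prefilled from detail_url.
        return SplashRequest(
            detail_url,
            callback=self.parse_detail,
            endpoint='execute',
            args={
                'lua_source': LUA_SCRIPT,
                'http_method': 'POST',
                'formdata': formdata,                        # available in Lua as splash.args.formdata
                'headers': {'Referer': 'https://example.com/catalog'},  # placeholder header
            },
        )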
function main(splash)
  if splash.args.cookies then
    splash:init_cookies(splash.args.cookies)
  end

  -- Perform the request described by splash.args (also used for retries).
  local function web_request()
    if splash.args.http_method == 'GET' then
      assert(splash:go{
        url=splash.args.url,
        headers=splash.args.headers,
        http_method=splash.args.http_method,
        body=splash.args.body,
      })
    else
      assert(splash:go{
        url=splash.args.url,
        headers=splash.args.headers,
        http_method=splash.args.http_method,
        body=splash.args.body,
        formdata=splash.args.formdata,
      })
    end
  end
  -- AREA OF THE CODE UNDER QUESTION
  local retry_max = 3
  local retry_count = 0

  splash:on_response(function(response)
    -- Retry when the response is the site's error page (detected by its URL).
    if string.find(response.url, 'error_check.html') ~= nil then
      if retry_count < retry_max then
        retry_count = retry_count + 1
        web_request()
      else
        -- Not sure how to capture this in the item pipeline.
        -- I would also like to capture the form post details,
        -- such as the form data and headers.
        error('Max retry exceeded for ' .. response.url)
      end
    end
  end)
  web_request()
  assert(splash:wait(0.5))

  local entries = splash:history()
  local last_response = entries[#entries].response

  return {
    url = splash:url(),
    headers = last_response.headers,
    http_status = last_response.status,
    cookies = splash:get_cookies(),
    html = splash:html(),
    har = splash:har(),
    retry_count = retry_count
  }
end
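One option I am considering for the logging question, though I am not sure it is idiomatic: with endpoint='execute', scrapy_splash exposes the table returned by the script as response.data, so the callback could log the retries and yield a dedicated item for a pipeline to persist. FailedFetchItem and the pipeline behind it are hypothetical names:

import logging

import scrapy

logger = logging.getLogger(__name__)


class FailedFetchItem(scrapy.Item):
    # Hypothetical item that only carries failure details to a pipeline.
    url = scrapy.Field()
    retry_count = scrapy.Field()
    formdata = scrapy.Field()
    headers = scrapy.Field()


# Callback on the spider sketched above (shown unindented for brevity).
def parse_detail(self, response):
    data = response.data                                   # table returned by the Lua script
    splash_args = response.request.meta.get('splash', {}).get('args', {})

    if data.get('retry_count'):
        # A retry happened; record it so the attempt can be reviewed manually later.
        logger.warning('Detail page needed %s retries: %s',
                       data['retry_count'], data.get('url'))
        yield FailedFetchItem(
            url=data.get('url'),
            retry_count=data['retry_count'],
            formdata=splash_args.get('formdata'),          # original form post details
            headers=splash_args.get('headers'),
        )

    # ... normal item extraction from data['html'] continues here ...

I am also unsure whether raising error() inside on_response is the right way to signal the final failure, or whether it would be better to collect the details into the table the script returns so they always reach the callback. Is there a more standard hook (errback, middleware, pipeline) for this?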