我正在尝试下载以下文件,这些文件的链接是从我正在抓取的网站上获得的:
urllib.urlretrieve 似乎不适用于 .aspx 页面。有什么建议吗?
答案 0 :(得分:0)
该站点需要 JavaScript 才能下载文件,但是您可以尝试以下脚本:
product_id
打印:
SELECT
id
, product_id
, url
, is_primary
FROM table
WHERE is_primary
import re
from math import pow, pi, cos
import requests
# algorithm used by site to compute challenge headers:
def test(Challenge):
var_arr = [int(c) for c in str(Challenge)][::-1]
LastDig = var_arr[0]
var_arr.sort()
minDig = min(var_arr)
subvar1 = (2 * (var_arr[2])) + (var_arr[1] * 1)
subvar2 = int(str((2 * var_arr[2])) + str(var_arr[1]))
my_pow = pow(((var_arr[0] * 1) + 2), var_arr[1])
x = (Challenge * 3 + subvar1) * 1
y = cos(pi * subvar2)
answer = x * y
answer -= my_pow * 1
answer += (minDig * 1) - (LastDig * 1)
answer = str(int(answer)) + str(subvar2)
return(answer)
# Attachment URL to download; the first GET returns a page that embeds the
# Challenge / ChallengeId values instead of the file itself.
url = 'http://mavat.moin.gov.il/mavatps/forms/Attachment.aspx?edid=6000405287445&edn=8F90EFA829F078A90C93EAE032F3A079636EBC6FCFC3BC74C87EAF3A9A0E9E4B&opener=AttachmentError.aspx'

with requests.session() as s:
    # Timeouts keep the script from hanging forever on an unresponsive server.
    text = s.get(url, timeout=30).text
    Challenge = int(re.findall(r'Challenge=(\d+);', text)[0])
    ChallengeId = int(re.findall(r'ChallengeId=(\d+);', text)[0])

    headers = {
        'X-AA-Challenge-ID': str(ChallengeId),
        'X-AA-Challenge-Result': str(test(Challenge)),
        'X-AA-Challenge': str(Challenge),
        'Content-Type': 'text/plain',
    }

    # Submit the computed answer; the response body is not needed, only the
    # follow-up GET on the same session is used.
    s.post(url, headers=headers, timeout=30)
    r = s.get(url, timeout=30)

    raw_name = re.findall(r'filename=(.*)', r.headers['Content-Disposition'])[0]
    # The name comes from a server-controlled header: drop surrounding quotes
    # and any directory components so the file is always written into the
    # current working directory.
    filename = re.split(r'[\\/]', raw_name.strip().strip('"'))[-1]

    print('Writing {}'.format(filename))
    with open(filename, 'wb') as f_out:
        f_out.write(r.content)
输出内容为:
Writing KML_2000972605.kml