所以我有大约6600个图像URL,我想从pickle文件中读取它们,以便在本地下载它们的图像。我无法解决的两个主要问题:
1)抛出UnicodeEncodeError
错误的URL。我在网上尝试了很多可用的解决方案,没有一个工作,所以我最终忘记了这个网址,但我更喜欢,如果有办法破解它。例如,这是包含Polaska字符中文的URL。
2)另一个错误如下(一个例子):
ssl.CertificateError: hostname 'www.straitstimes.com.sg' doesn't match either of 'gp1.adn.edgecastcdn.net', 'gs1.adn.edgecastcdn.net', 'ne1.adn.edgecastcdn.net', 'www.uship.com', 'www.bluefly.com', 'www.belleandclive.com', 'is.belleandclive.com', 'connections.cochlear.com', 'assets.pokemon.com', 'www.shopperschoice.com', 'www.biznessapps.com', 'cdn.shocho.co', 'secure.hibustudio.com', 'www.stardoll.com', 'adn.wiredrive.com', 'www.speedtest.net', 'www.cduniverse.com', 'ak-site-origin-cover.cduniverse.com', 'cover.cduniverse.com', 'g.cduniverse.com', 'www.renttherunway.com', 'cdn2.navexglobal.com', 'www.chdist.com', 'www.thefanorama.com', 'cdn2.mediasilo.com', 'cdn.citadoncw.com', 'www.woodcraft.com', 'marketing-admin.upsight-api.com', 'www.edgecast.com'
当我看到图片可用时,我需要从https://static.straitstimes.com.sg/sites/default/files/styles/x_large/public/articles/2017/09/23/ST_20170923_WONZ_3440048.jpg?itok=-iG-zSvr下载图片。
我想知道如何修改下面的代码来克服这两个问题,而不是尝试除了短语。
import pickle
import urllib.request
import re
import requests
from urllib.parse import quote
# img = urllib.request.urlopen(quote(value))
# img = urllib.urlopen(quote(value))
import time
count = 0
with open('images.pickle', 'rb') as handle:
b = pickle.load(handle)
print(len(b))
for key, value in b.items():
print(value)
print(key, value)
#value = iriToUri(value)
if value != 'NA':
count += 1
print(value)
try:
img = urllib.request.urlopen(quote(value, "\./_-:"))
#img = requests.get(value)
split = urllib.parse.urlsplit(value)
extension = split.path.split(".")[-1]
print(extension)
if extension.lower() == 'jpg':
filename = "immigration_images/" + str(key) + ".jpg"
elif extension.lower() == 'jpeg':
filename = "immigration_images/" + str(key) + ".jpeg"
elif extension.lower() == 'ico':
filename = "immigration_images/" + str(key) + ".ico"
else:
filename = "immigration_images/" + str(key) + ".png"
img_extension = img.info()['Content-Type']
#print(img_extension)
if img_extension:
if 'jpeg' in img_extension:
filename = "immigration_images/" + str(key) + ".jpeg"
elif 'jpg' in img_extension:
filename = "immigration_images/" + str(key) + '.jpg'
elif 'png' in img_extension:
filename = "immigration_images/" + str(key) + '.png'
#urllib.request.urlretrieve(value, filename)
urllib.request.urlretrieve(value, filename)
except (urllib.error.ContentTooShortError, urllib.error.URLError, urllib.error.HTTPError, UnicodeEncodeError) as e:
print(e)
continue
print(count)
我在Python 3.6.3 :: Anaconda custom (64-bit)
CentOS Linux release 7.4.1708 (Core)
引用:
Traceback (most recent call last):
File "/scratch2/news_bias/test_pickle.py", line 39, in <module>
img = urllib.request.urlopen(quote(value, "\./_-:"))
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 564, in error
result = self._call_chain(*args)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 756, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 526, in open
response = self._open(req, data)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 544, in _open
'_open', req)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 1318, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1026, in _send_output
self.send(msg)
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 964, in send
self.connect()
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1400, in connect
server_hostname=server_hostname)
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 407, in wrap_socket
_context=self, _session=session)
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 814, in __init__
self.do_handshake()
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 1068, in do_handshake
self._sslobj.do_handshake()
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 694, in do_handshake
match_hostname(self.getpeercert(), self.server_hostname)
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 327, in match_hostname
% (hostname, ', '.join(map(repr, dnsnames))))