当我尝试使用pandas从维基百科下载数据时出现错误。
pd.read_html('http://simple.wikipedia.org/wiki/List_of_U.S._states')
错误消息如下:
SSLError Traceback (most recent call last)
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1239 try:
-> 1240 h.request(req.get_method(), req.selector, req.data, headers)
1241 except OSError as err: # timeout error
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/http/client.py in request(self, method, url, body, headers)
1082 """Send a complete request to the server."""
-> 1083 self._send_request(method, url, body, headers)
1084
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/http/client.py in _send_request(self, method, url, body, headers)
1127 body = body.encode('iso-8859-1')
-> 1128 self.endheaders(body)
1129
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/http/client.py in endheaders(self, message_body)
1078 raise CannotSendHeader()
-> 1079 self._send_output(message_body)
1080
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/http/client.py in _send_output(self, message_body)
910
--> 911 self.send(msg)
912 if message_body is not None:
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/http/client.py in send(self, data)
853 if self.auto_open:
--> 854 self.connect()
855 else:
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/http/client.py in connect(self)
1236 self.sock = self._context.wrap_socket(self.sock,
-> 1237 server_hostname=server_hostname)
1238 if not self._context.check_hostname and self._check_hostname:
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
375 server_hostname=server_hostname,
--> 376 _context=self)
377
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/ssl.py in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
746 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 747 self.do_handshake()
748
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/ssl.py in do_handshake(self, block)
982 self.settimeout(None)
--> 983 self._sslobj.do_handshake()
984 finally:
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/ssl.py in do_handshake(self)
627 """Start the SSL/TLS handshake."""
--> 628 self._sslobj.do_handshake()
629 if self.context.check_hostname:
SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:646)
During handling of the above exception, another exception occurred:
URLError Traceback (most recent call last)
<ipython-input-51-330bd889a78f> in <module>()
----> 1 fiddy_states = pd.read_html('http://simple.wikipedia.org/wiki/List_of_U.S._states')
2 print(fiddy_states)
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/site-packages/pandas/io/html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, tupleize_cols, thousands, encoding)
864 _validate_header_arg(header)
865 return _parse(flavor, io, match, header, index_col, skiprows,
--> 866 parse_dates, tupleize_cols, thousands, attrs, encoding)
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/site-packages/pandas/io/html.py in _parse(flavor, io, match, header, index_col, skiprows, parse_dates, tupleize_cols, thousands, attrs, encoding)
726 break
727 else:
--> 728 raise_with_traceback(retained)
729
730 ret = []
/Users/Soma/.pyenv/versions/3.5.0/lib/python3.5/site-packages/pandas/compat/__init__.py in raise_with_traceback(exc, traceback)
746 if traceback == Ellipsis:
747 _, _, traceback = sys.exc_info()
--> 748 raise exc.with_traceback(traceback)
749 else:
750 # this version of raise is a syntax error in Python 3
URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:646)>
我不知道为什么会这样。
答案 0 :(得分:2)
我在Linux上访问某个网站时遇到了同样的问题——而在Windows上,相同的代码可以正常解析该网站上的表格。我花了一些时间在Linux上比较和更新库版本,但没有效果,于是在调用read_html之前添加了一些额外的代码来处理SSL证书:
> import urllib3,certifi
>
> #Force certificate check and use certifi to handle the certificate.
> https = urllib3.PoolManager( cert_reqs='CERT_REQUIRED',
> ca_certs=certifi.where(),)
>
> url = https.urlopen('GET','https://yoursecureproblematicwebsite.com')
>
> #Then you parse the html as usual
> foo = pd.read_html(url.data)
还要确保您拥有最新版本的certifi:
>python -m pip install certifi --upgrade
这很可能不是最有效的方式,但我希望有所帮助。
丰齐