我试图使用 Python 2.7 和 Beautiful Soup 从美国人口普查网站获取数据。这是我使用的代码:
# Python 2.7: download the page with the standard-library urllib and parse it with Beautiful Soup.
import urllib
from bs4 import BeautifulSoup
url = "https://www.census.gov/quickfacts/table/PST045215/01"
# This is the call that raises the IOError (SSL handshake failure) reported in the traceback.
html = urllib.urlopen(url).read()
# No parser is named here, so Beautiful Soup picks one from whatever is installed.
soup = BeautifulSoup(html)
然而,这是我得到的错误:
IOError Traceback (most recent call last)
<ipython-input-5-47941f5ea96a> in <module>()
59
60 url = "https://www.census.gov/quickfacts/table/PST045215/01"
---> 61 html = urllib.urlopen(url).read()
62 soup = BeautifulSoup(html)
63
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.pyc in urlopen(url, data, proxies, context)
85 opener = _urlopener
86 if data is None:
---> 87 return opener.open(url)
88 else:
89 return opener.open(url, data)
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.pyc in open(self, fullurl, data)
211 try:
212 if data is None:
--> 213 return getattr(self, name)(url)
214 else:
215 return getattr(self, name)(url, data)
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.pyc in open_https(self, url, data)
441 if realhost: h.putheader('Host', realhost)
442 for args in self.addheaders: h.putheader(*args)
--> 443 h.endheaders(data)
444 errcode, errmsg, headers = h.getreply()
445 fp = h.getfile()
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in endheaders(self, message_body)
1051 else:
1052 raise CannotSendHeader()
-> 1053 self._send_output(message_body)
1054
1055 def request(self, method, url, body=None, headers={}):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in _send_output(self, message_body)
895 msg += message_body
896 message_body = None
--> 897 self.send(msg)
898 if message_body is not None:
899 #message_body was not a string (i.e. it is a file) and
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in send(self, data)
857 if self.sock is None:
858 if self.auto_open:
--> 859 self.connect()
860 else:
861 raise NotConnected()
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in connect(self)
1276
1277 self.sock = self._context.wrap_socket(self.sock,
-> 1278 server_hostname=server_hostname)
1279
1280 __all__.append("HTTPSConnection")
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ssl.pyc in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
351 suppress_ragged_eofs=suppress_ragged_eofs,
352 server_hostname=server_hostname,
--> 353 _context=self)
354
355 def set_npn_protocols(self, npn_protocols):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ssl.pyc in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
599 # non-blocking
600 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 601 self.do_handshake()
602
603 except (OSError, ValueError):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ssl.pyc in do_handshake(self, block)
828 if timeout == 0.0 and block:
829 self.settimeout(None)
--> 830 self._sslobj.do_handshake()
831 finally:
832 self.settimeout(timeout)
IOError: [Errno socket error] [SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:590)
答案 0 :(得分:2)
此问题的一种解决方法是改用 requests 库:
# Fetch the Census QuickFacts page with requests and print the page title.
import requests
from bs4 import BeautifulSoup

url = "https://www.census.gov/quickfacts/table/PST045215/01"
# Bound the wait so an unresponsive server cannot hang the script indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP 4xx/5xx response instead of parsing an error page.
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which parsers are installed.
soup = BeautifulSoup(response.content, "html.parser")
print(soup.title.get_text())
打印:
Alabama QuickFacts from the US Census Bureau
请注意,这可能还需要安装 requests[security] 包:
pip install requests[security]