IOError:使用BeautifulSoup时出现 [Errno socket error]

时间:2016-10-03 15:20:00

标签: python beautifulsoup

我正在尝试使用Python 2.7和Beautiful Soup从美国人口普查网站获取数据。这是我使用的代码:

import urllib
from bs4 import BeautifulSoup

url = "https://www.census.gov/quickfacts/table/PST045215/01"
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)

然而,这是我得到的错误:

IOError                                   Traceback (most recent call last)
<ipython-input-5-47941f5ea96a> in <module>()
     59 
     60 url = "https://www.census.gov/quickfacts/table/PST045215/01"
---> 61 html = urllib.urlopen(url).read()
     62 soup = BeautifulSoup(html)
     63 

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.pyc in urlopen(url, data, proxies, context)
     85         opener = _urlopener
     86     if data is None:
---> 87         return opener.open(url)
     88     else:
     89         return opener.open(url, data)

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.pyc in open(self, fullurl, data)
    211         try:
    212             if data is None:
--> 213                 return getattr(self, name)(url)
    214             else:
    215                 return getattr(self, name)(url, data)

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.pyc in open_https(self, url, data)
    441             if realhost: h.putheader('Host', realhost)
    442             for args in self.addheaders: h.putheader(*args)
--> 443             h.endheaders(data)
    444             errcode, errmsg, headers = h.getreply()
    445             fp = h.getfile()

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in endheaders(self, message_body)
   1051         else:
   1052             raise CannotSendHeader()
-> 1053         self._send_output(message_body)
   1054 
   1055     def request(self, method, url, body=None, headers={}):

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in _send_output(self, message_body)
    895             msg += message_body
    896             message_body = None
--> 897         self.send(msg)
    898         if message_body is not None:
    899             #message_body was not a string (i.e. it is a file) and

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in send(self, data)
    857         if self.sock is None:
    858             if self.auto_open:
--> 859                 self.connect()
    860             else:
    861                 raise NotConnected()

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in connect(self)
   1276 
   1277             self.sock = self._context.wrap_socket(self.sock,
-> 1278                                                   server_hostname=server_hostname)
   1279 
   1280     __all__.append("HTTPSConnection")

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ssl.pyc in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    351                          suppress_ragged_eofs=suppress_ragged_eofs,
    352                          server_hostname=server_hostname,
--> 353                          _context=self)
    354 
    355     def set_npn_protocols(self, npn_protocols):

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ssl.pyc in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
    599                         # non-blocking
    600                         raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 601                     self.do_handshake()
    602 
    603             except (OSError, ValueError):

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ssl.pyc in do_handshake(self, block)
    828             if timeout == 0.0 and block:
    829                 self.settimeout(None)
--> 830             self._sslobj.do_handshake()
    831         finally:
    832             self.settimeout(timeout)

IOError: [Errno socket error] [SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:590)

我已经尝试过Stack Overflow上两个相关帖子(例如这个和这个)中给出的解决方案,但它们都无法解决问题。

1 个答案:

答案 0 :(得分:2)

此问题的一种解决方法是改用requests库:

import requests
from bs4 import BeautifulSoup

url = "https://www.census.gov/quickfacts/table/PST045215/01"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
print(soup.title.get_text())

打印:

Alabama QuickFacts from the US Census Bureau

请注意,这可能还需要安装requests[security]包:

pip install requests[security]