我正在尝试把一个网站读入 BeautifulSoup，但到目前为止，所有建立安全连接的尝试都失败了（我最初想用 Python 3 解决这个问题，但正如下文所示，那条路同样困难重重）。下面是我最近一次使用 urllib2 的尝试（我既没有找到 urllib3 的示例，也没能成功把这段代码改写为 urllib3）：
import httplib, ssl, urllib2, socket
from bs4 import BeautifulSoup
class HTTPSConnectionV3(httplib.HTTPSConnection):
    """HTTPS connection that tries an SSLv3 handshake first, then SSLv23.

    Intended for servers that abort the default handshake but accept SSLv3.
    """

    def __init__(self, *args, **kwargs):
        httplib.HTTPSConnection.__init__(self, *args, **kwargs)

    def _open_socket(self):
        """Open a TCP connection to the host, tunnelling if a proxy is set.

        Returns the connected (not yet TLS-wrapped) socket.
        """
        sock = socket.create_connection((self.host, self.port), self.timeout)
        if self._tunnel_host:
            # _tunnel() talks to the proxy through self.sock.
            self.sock = sock
            self._tunnel()
        return sock

    def connect(self):
        """Connect and wrap the socket, preferring SSLv3 over SSLv23.

        Raises:
            ssl.SSLError: if the SSLv23 fallback handshake fails as well.
        """
        sock = self._open_socket()
        try:
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv3)
        except ssl.SSLError:
            # The message now names the protocol that is actually tried next.
            print("SSLv3 handshake failed; trying SSLv23.")
            # A socket whose handshake failed has most likely been dropped by
            # the peer, so retry on a fresh connection instead of reusing it.
            sock.close()
            sock = self._open_socket()
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv23)
class HTTPSHandlerV3(urllib2.HTTPSHandler):
    """urllib2 HTTPS handler that opens connections via HTTPSConnectionV3."""

    def https_open(self, req):
        """Open *req* with do_open, using HTTPSConnectionV3 as connection class."""
        response = self.do_open(HTTPSConnectionV3, req)
        return response
# Install a global opener so urllib2.urlopen() routes https:// requests
# through HTTPSHandlerV3.
urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3()))
r = urllib2.urlopen('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm')
try:
    s = r.read()
finally:
    # Release the connection even if reading the body fails.
    r.close()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(s, "html.parser")
for t in soup.find_all('h2'):
    print(t)
当我运行此代码时,我得到以下堆栈跟踪:
Traceback (most recent call last):
File "test.py", line 27, in <module>
r = urllib2.urlopen('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm')
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "test.py", line 22, in https_open
return self.do_open(HTTPSConnectionV3, req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1187, in do_open
r = h.getresponse(buffering=True)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1045, in getresponse
response.begin()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 409, in begin
version, status, reason = self._read_status()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 373, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''
更棘手的是，用 curl 直接访问该网址时，我看到的是：
$ curl -v https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm
* Adding handle: conn: 0x7ff1f1804000
* Adding handle: send: 0
* Adding handle: recv: 0
* Curl_addHandleToPipeline: length: 1
* - Conn 0 (0x7ff1f1804000) send_pipe: 1, recv_pipe: 0
* About to connect() to bw6.clpccd.cc.ca.us port 443 (#0)
* Trying 205.155.225.145...
* Connected to bw6.clpccd.cc.ca.us (205.155.225.145) port 443 (#0)
* Server aborted the SSL handshake
* Closing connection 0
curl: (35) Server aborted the SSL handshake
如果我强制使用SSLv3,我会得到预期的输出:
$ curl -v -3 https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm
* Adding handle: conn: 0x7ffa50804000
* Adding handle: send: 0
* Adding handle: recv: 0
* Curl_addHandleToPipeline: length: 1
* - Conn 0 (0x7ffa50804000) send_pipe: 1, recv_pipe: 0
* About to connect() to bw6.clpccd.cc.ca.us port 443 (#0)
* Trying 205.155.225.145...
* Connected to bw6.clpccd.cc.ca.us (205.155.225.145) port 443 (#0)
* SSL 3.0 connection using SSL_RSA_WITH_RC4_128_SHA
* Server certificate: bw6.clpccd.cc.ca.us
* Server certificate: VeriSign Class 3 Secure Server CA - G3
* Server certificate: VeriSign Class 3 Public Primary Certification Authority - G5
* Server certificate: Class 3 Public Primary Certification Authority
> GET /clpccd/2014/02/sched_l.htm HTTP/1.1
> User-Agent: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
> Host: bw6.clpccd.cc.ca.us
> Accept: */*
> Referer:
>
< HTTP/1.1 200 OK
< Date: Thu, 23 Oct 2014 00:00:11 GMT
* Server Oracle-Application-Server-10g/10.1.3.4.0 Oracle-HTTP-Server is not blacklisted
< Server: Oracle-Application-Server-10g/10.1.3.4.0 Oracle-HTTP-Server
< Last-Modified: Wed, 22 Oct 2014 20:05:42 GMT
< ETag: "422e-1e72-54480e16"
< Accept-Ranges: bytes
< Content-Length: 7794
< Connection: close
< Content-Type: text/html
<
<html....>
* Closing connection 0
如果我之前的尝试对大家有参考价值：下面是我在 Python 3 下使用 Requests 库（参照其文档中的 “Example: Specific SSL Version” 一节）时的做法：
import ssl
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
from bs4 import BeautifulSoup
# Request headers carrying a browser-like User-agent string.
headers = {'User-agent': 'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
class Ssl3HttpAdapter(HTTPAdapter):
    """Transport adapter that makes its connection pools use SSLv3."""

    def init_poolmanager(self, connections, maxsize, block=True, **pool_kwargs):
        """Create the urllib3 PoolManager with ssl_version pinned to SSLv3.

        Args:
            connections: number of connection pools to cache.
            maxsize: maximum number of connections kept per pool.
            block: whether a pool blocks when it has no free connection.
            **pool_kwargs: extra PoolManager options, forwarded unchanged so
                the override accepts the same calls as the base method.
        """
        # This adapter exists to force SSLv3, so it overrides any ssl_version
        # a caller might pass in pool_kwargs.
        pool_kwargs['ssl_version'] = ssl.PROTOCOL_SSLv3
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       **pool_kwargs)
# Route every request to this host through the SSLv3-only adapter.
s = requests.Session()
s.mount('https://bw6.clpccd.cc.ca.us', Ssl3HttpAdapter())
r = s.get('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm', headers=headers)
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(r.text, "html.parser")
for t in soup.find_all('h2'):
    print(t)
它产生了一个类似的（但更难理解的）堆栈跟踪：
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 331, in _make_request
httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 1172, in getresponse
response.begin()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 351, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 321, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine: ''
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 362, in send
timeout=timeout
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 559, in urlopen
_pool=self, _stacktrace=stacktrace)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/util/retry.py", line 245, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/packages/six.py", line 309, in reraise
raise value.with_traceback(tb)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 1172, in getresponse
response.begin()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 351, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 321, in _read_status
raise BadStatusLine(line)
requests.packages.urllib3.exceptions.ProtocolError: ('Connection aborted.', BadStatusLine("''",))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./test.py", line 23, in <module>
r = s.get('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm', headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 469, in get
return self.request('GET', url, **kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 457, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 569, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 407, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
答案 0 :(得分:0)
为了先把事情推进下去，我写了下面这段代码，尽管我并不认为这是一个 Pythonic 的答案：
import os
import subprocess
from bs4 import BeautifulSoup
# Shell out to curl with SSLv3 forced (-3); capture stdout, discard stderr.
# The argv list is now closed with "]" (the original line was a SyntaxError),
# and the devnull handle is closed once curl has finished.
with open(os.devnull, 'w') as FNULL:
    html = subprocess.Popen(
        ["curl", "-3", "https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm"],
        stdout=subprocess.PIPE, stderr=FNULL).communicate()[0]
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")
for t in soup.find_all('h2'):
    print(t.text)
基本上，我只是调用 curl -3 <url> 并捕获其输出。