我对 urllib2 有一个疑问。
我尝试用 Python 编写一个解析器,使用 urllib2 的 opener.open() 方法建立连接,
然后读取响应,但在第四次调用时抛出了异常。
代码示例:
class HtmlParser(object):
    """Download the links found by a ``GoogleSearch`` and strip noise from them.

    Constructing the parser creates a ``GoogleSearch`` for the query and runs
    it immediately; ``start_parsing`` then fetches and cleans every entry of
    the search object's ``web_links``.
    """

    def __init__(self, search_text, page_count=1):
        """Create the search helper and execute the search right away.

        Args:
            search_text: Query handed to ``GoogleSearch``.
            page_count: Forwarded unchanged to ``GoogleSearch`` (default 1).
        """
        self.GoogleSearch = GoogleSearch(search_text, page_count)
        self.GoogleSearch.search()

    def parse_page(self, link):
        """Fetch ``link``, parse it as HTML and return the cleaned tree.

        Args:
            link: URL (or request object) accepted by ``opener.open``.

        Returns:
            The ``BeautifulSoup`` tree after ``clenup`` has been applied.

        Raises:
            Whatever ``opener.open`` / ``read`` raise; network errors are not
            caught here and propagate to the caller.
        """
        response = opener.open(link)
        try:
            html = response.read()
        finally:
            # Always release the connection, even when the read fails.
            response.close()
        page = BeautifulSoup(html, 'html.parser')
        return self.clenup(page)

    def clenup(self, page):
        """Remove comments, ``<script>`` and ``<link>`` tags from ``page``.

        The tree is modified in place and the same object is returned.
        """
        # NOTE(review): both settings are read but not applied anywhere yet;
        # presumably they are meant to drive further stripping - confirm.
        unnes_elem = Config.get('unnecessary', 'elements')
        unnes_attr = Config.get('unnecessary', 'attributes')

        comments = page.findAll(text=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()
        for tag_name in ('script', 'link'):
            for tag in page(tag_name):
                tag.extract()
        return page

    def start_parsing(self):
        """Parse every search result link, in order.

        Returns:
            A list with the cleaned tree of each link in
            ``self.GoogleSearch.web_links``.
        """
        return [self.parse_page(link) for link in self.GoogleSearch.web_links]
按如下方式启动:
# Search for 'hello' (default page count) and parse every result link.
pars = HtmlParser('hello')
pars.start_parsing()
异常信息(Traceback):
File "parser.py", line 52, in <module>
pars.start_parsing()
File "parser.py", line 48, in start_parsing
self.parse_page(link)
File "parser.py", line 26, in parse_page
response = opener.open(link)
File "/usr/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1187, in do_open
r = h.getresponse(buffering=True)
File "/usr/lib/python2.7/httplib.py", line 1051, in getresponse
response.begin()
File "/usr/lib/python2.7/httplib.py", line 415, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.7/httplib.py", line 379, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''