我正在玩Python并试图解析网页,以便在亚马逊消防电视上自动完成我的足球观看过程。
我制作了以下代码来按网址阅读HTML页面:
from httplib import BadStatusLine
import urllib2
import logging
htmlWorker = html_worker.HtmlWorkerLiveFootball()
htmlWorker.get_list_of_matches(htmlWorker.URL)
class HtmlWorkerLiveFootball:
URL = 'http://livefootball.ws/'
def get_list_of_matches(self, url):
opener = urllib2.OpenerDirector()
for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler, HTTPMethodFallback, HEADRedirectHandler,
urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
opener.add_handler(handler())
opener.addheaders = [('User-agent',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36'
)]
urllib2.install_opener(opener)
try:
logging.warning("request = %s" % opener.addheaders)
page = urllib2.urlopen(url)
logging.warning("result = %s" % page.read())
except urllib2.HTTPError, error:
logging.error("error code = %d" % error.code)
except BadStatusLine:
logging.error("could not fetch %s" % url)
class HeadRequest(urllib2.Request):
def get_method(self):
return "HEAD"
class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
def redirect_request(self, req, fp, code, msg, headers, newurl):
if code in (301, 302, 303, 307):
logging.warning("redirect_request = %d" % code)
newurl = newurl.replace(' ', '%20')
logging.warning("new url = %s" % newurl)
logging.warning("headers = %s" % headers)
newheaders = dict((k, v) for k, v in req.headers.items()
if k.lower() not in ("content-length", "content-type"))
logging.debug("newheaders = %s" % newheaders)
request = HeadRequest(newurl, headers=newheaders, origin_req_host=req.get_origin_req_host(),
unverifiable=True)
request.add_header('User-agent',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36')
request.add_header('Cookie', headers.dict['set-cookie'])
request.add_header('Host', "livefootball.ws")
request.add_header('Accept-Encoding', "gzip,deflate,sdch")
request.add_header('Accept', "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
request.add_header('Cache-Control', "max-age=0")
request.add_header('Accept-Language', "en-US,en;q=0.8,ru;q=0.6")
logging.warning("request = %s" % request.headers)
return request
else:
raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
class HTTPMethodFallback(urllib2.BaseHandler):
def http_error_405(self, req, fp, code, msg, headers):
logging.warning("http_error_405. Headers = %s" % headers)
fp.read()
fp.close()
newheaders = dict((k, v) for k, v in req.headers.items()
if k.lower() not in ("content-length", "content-type"))
return self.parent.open(urllib2.Request(req.get_full_url(),
headers=newheaders,
origin_req_host=req.get_origin_req_host(),
unverifiable=True))
它适用于互联网上的大量网站,但不幸的是看起来像我需要尝试避免DDOS攻击的网站与一些不熟悉的机制(重定向+一些饼干的东西)。我试图模仿浏览器行为,但最后有空字符串。
执行此代码后,我在此处记录:
WARNING:root:request = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36')]
WARNING:root:redirect_request = 307
WARNING:root:new url = http://livefootball.ws/?dos=1
WARNING:root:headers = Server: nginx
Date: Sun, 15 Jun 2014 14:11:03 GMT
Content-Type: text/html
Content-Length: 180
Connection: close
Set-Cookie: antid=6abeccafd9ac44951b4acc7f642649b7; path=/
Location: http://livefootball.ws/?dos=1
WARNING:root:request = {'Accept-language': 'en-US,en;q=0.8,ru;q=0.6', 'Accept-encoding': 'gzip,deflate,sdch', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36', 'Host': 'livefootball.ws', 'Cookie': 'antid=6abeccafd9ac44951b4acc7f642649b7; path=/', 'Cache-control': 'max-age=0'}
WARNING:root:result =
如何使用python阅读此页面?感谢。
答案 0 :(得分:1)
如果您想按网址阅读HTML网页,可以使用requests库代替urllib2
。它很容易使用:
import requests
session = requests.Session()
index_url = 'http://livefootball.ws/'
index_request = session.get(index_url)
#change encoding of the response
index_request.encoding = 'CP1251'
#print page content
print index_request.text