如何浏览网站中的其他网页以提取数据而无需再次进行身份验证?使用python和urllib2
请参阅下面的代码:我先打开第一个页面 http://xx.xx.xx.xx:8080/status,通过身份验证后获取到了所需的内容;随后尝试打开第二个页面 http://xx.xx.xx.xx:8080/uistatus.html,但程序却进入了 except 异常处理分支。
程序输出的错误信息为:Unexpected error HTTP Error 401: Unauthorized(未经授权)
代码:
try:
pattern = r'\s*Current\s+stream\s+number:\s*(\d+)'
pattern2 = r'\s*Reconnects:\s*(\d+)'
SERVER = 'http://xx.xx.xx.xx:8080/status'
authinfo = urllib2.HTTPPasswordMgrWithDefaultRealm()
authinfo.add_password(None, SERVER, 'xxxxxx', 'xxxxxxx')
page = 'http://xx.xx.xx.xx:8080/status'
handler = urllib2.HTTPBasicAuthHandler(authinfo)
myopener = urllib2.build_opener(handler)
opened = urllib2.install_opener(myopener)
output = urllib2.urlopen(page)
#print output.read()
soup = BeautifulSoup(output.read(), "lxml")
#print(soup)
paragraphs = soup.findAll('p')
data = []
for para in paragraphs:
found = re.finditer(pattern, para.text, re.IGNORECASE);
data.extend([x.group(1) for x in found])
#print data
print "exstreamer 1 status: ", int(data[0])
if int(data[0]) == 1:
mesg = "Centerpoint exstreamer connected to main streaming host"
centerpoint_online = "Online"
centerpoint_connection = "Main"
elif int(data[0]) == 2 or int(data[0]) == 3:
mesg = "Centerpoint exstreamer connected to local qkradio instreamer"
print 'alert sent', mesg
with open("/var/www/html/status.log", "a") as myfile:
myfile.write(time.strftime("%Y-%m-%d %H:%M")+ "\t Centerpoint exstreamer connected to local qkradio instreamer\n")
centerpoint_connection = "Backup"
system_ok = "Offline"
data = []
for para in paragraphs:
found = re.finditer(pattern2, para.text, re.IGNORECASE);
data.extend([x.group(1) for x in found])
centerpoint_reconnect_number_old = centerpoint_reconnect_number
centerpoint_reconnect_number = int(data[0])
print "Centerpoint number of reconnects: ", centerpoint_reconnect_number
if not centerpoint_reconnect_number == centerpoint_reconnect_number_old:
centerpoint_stream_stable = "Disconnected/ Reconnected to Stream"
mesg = "Centerpoint exstreamer disconnect/reconnect, possible buffering issues"
print 'alert sent', mesg
with open("/var/www/html/status.log", "a") as myfile:
myfile.write(time.strftime("%Y-%m-%d %H:%M")+ "\t Centerpoint exstreamer disconnect/reconnect, possible buffering issues\n")
else:
centerpoint_stream_stable = "system ok"
page = 'http://xx.xx.xx.xx:8080/uistatus.html'
output = urllib2.urlopen(page)
htmlparser = etree.HTMLParser()
tree = etree.parse(output, htmlparser)
#print tree.xpath("/html/body/table/tr[3]/th[2]/font/text()")
print tree.xpath("//th/font[@color]/text()")
centerpoint_stream_status = tree.xpath("//th/font[@color]/text()")
if centerpoint_stream_status is "['IDLE']":
mesg = "Centerpoint exstreamer source IDLE"
print 'alert sent', mesg
with open("/var/www/html/status.log", "a") as myfile:
myfile.write(time.strftime("%Y-%m-%d %H:%M")+ "\t Centerpoint exstreamer source IDLE\n")
except urllib2.URLError:
print "Internet dropped, or error"
mesg = "Centerpoint exstreamer unreachable"
print 'alert sent', mesg
i_centerpoint_online = centerpoint_online + 1
if centerpoint_online == 3:
centerpoint_online = 0
with open("/var/www/html/status.log", "a") as myfile:
myfile.write(time.strftime("%Y-%m-%d %H:%M")+ "\t Centerpoint exstreamer unreachable\n")
centerpoint_online = "Offline"
system_ok = "Offline"
except Exception, err:
print "Unexpected error", err
centerpoint_online = "Offline"
system_ok = "Offline"