"""
download html
"""
def download_html(url):
html = ""
try:
time.sleep(random.randint(1, 3))
req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) \
AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept':'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding':'gzip',
'Connection':'close',
'Referer':None
}
req_timeout = 5
req = urllib2.Request(url, None, req_header)
response = urllib2.urlopen(req, None, req_timeout)
html = response.read()
        # Escape bare '&' characters so the strict XML parser does not reject them.
        html = html.replace("&", "&amp;")
#html = html.replace("xmlns", "id")
#html = html.replace("id:fb", "class")
return html
except:
return ""
"""
parse html use xpath element
"""
def parse_xpath_value(xpath_element, html):
f = StringIO.StringIO(html)
tree = etree.parse(f)
r = tree.xpath(xpath_element)
print r
def main():
    url = "http://academic.research.microsoft.com/RankList?entitytype=4&topDomainID=1&subDomainID=0&last=0&start=1&end=100"
    html = download_html(url)
    journal_xpath = "//tbody/tr/td/a/text()"
    journal_list = parse_xpath_value(journal_xpath, html)


if __name__ == '__main__':
    main()
I want to parse the journal list from that URL with the code above. When I looked at the page source, I noticed xmlns and xmlns:fb attributes on the html element. If I strip xmlns and xmlns:fb out of the html first, my code works fine.
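For illustration, here is a small self-contained sketch (my own example, not part of the original script) of what those attributes do to the XPath: once the document carries a default xmlns, every element lives in that namespace, so an unprefixed path matches nothing unless a prefix is registered in the query or the page is parsed with lxml's namespace-ignoring HTML parser.

import StringIO
from lxml import etree

sample = """<html xmlns="http://www.w3.org/1999/xhtml"
      xmlns:fb="http://www.facebook.com/2008/fbml">
<body><table><tbody><tr><td><a>Some Journal</a></td></tr></tbody></table></body>
</html>"""

# Strict XML parse: elements are in the XHTML namespace, so the plain path finds nothing.
tree = etree.parse(StringIO.StringIO(sample))
print tree.xpath("//tbody/tr/td/a/text()")                              # []

# Bind a prefix to the default namespace and use it in the expression.
ns = {'x': 'http://www.w3.org/1999/xhtml'}
print tree.xpath("//x:tbody/x:tr/x:td/x:a/text()", namespaces=ns)       # ['Some Journal']

# Or parse as HTML: lxml's HTML parser ignores namespaces,
# so the original XPath works without stripping xmlns by hand.
html_tree = etree.parse(StringIO.StringIO(sample), etree.HTMLParser())
print html_tree.xpath("//tbody/tr/td/a/text()")                         # ['Some Journal']

The last variant tends to be the most forgiving for real-world pages, since it also tolerates the unescaped ampersands and unclosed tags that a strict etree.parse rejects.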