"""
download html
"""
def download_html(url):
html = ""
try:
time.sleep(random.randint(1, 3))
req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) \
AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept':'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding':'gzip',
'Connection':'close',
'Referer':None
}
req_timeout = 5
req = urllib2.Request(url, None, req_header)
response = urllib2.urlopen(req, None, req_timeout)
html = response.read()
        # Escape bare '&' characters so the strict XML parser does not reject them.
        html = html.replace("&", "&amp;")
#html = html.replace("xmlns", "id")
#html = html.replace("id:fb", "class")
return html
except:
return ""
"""
parse html use xpath element
"""
def parse_xpath_value(xpath_element, html):
f = StringIO.StringIO(html)
tree = etree.parse(f)
r = tree.xpath(xpath_element)
print r
def main():
    url = "http://academic.research.microsoft.com/RankList?entitytype=4&topDomainID=1&subDomainID=0&last=0&start=1&end=100"
    html = download_html(url)
    journal_xpath = "//tbody/tr/td/a/text()"
    journal_list = parse_xpath_value(journal_xpath, html)


if __name__ == '__main__':
    main()
I want to parse the journal list from that URL with the code above. When I looked at the page source, I noticed xmlns and xmlns:fb attributes on the html element. If I strip xmlns and xmlns:fb out of the html first, my code works fine.
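For illustration, here is a small self-contained sketch (my own example, not part of the original script) of what those attributes do to the XPath: once the document carries a default xmlns, every element lives in that namespace, so an unprefixed path matches nothing unless a prefix is registered in the query or the page is parsed with lxml's namespace-ignoring HTML parser.

import StringIO
from lxml import etree

sample = """<html xmlns="http://www.w3.org/1999/xhtml"
      xmlns:fb="http://www.facebook.com/2008/fbml">
<body><table><tbody><tr><td><a>Some Journal</a></td></tr></tbody></table></body>
</html>"""

# Strict XML parse: elements are in the XHTML namespace, so the plain path finds nothing.
tree = etree.parse(StringIO.StringIO(sample))
print tree.xpath("//tbody/tr/td/a/text()")                              # []

# Bind a prefix to the default namespace and use it in the expression.
ns = {'x': 'http://www.w3.org/1999/xhtml'}
print tree.xpath("//x:tbody/x:tr/x:td/x:a/text()", namespaces=ns)       # ['Some Journal']

# Or parse as HTML: lxml's HTML parser ignores namespaces,
# so the original XPath works without stripping xmlns by hand.
html_tree = etree.parse(StringIO.StringIO(sample), etree.HTMLParser())
print html_tree.xpath("//tbody/tr/td/a/text()")                         # ['Some Journal']

The last variant tends to be the most forgiving for real-world pages, since it also tolerates the unescaped ampersands and unclosed tags that a strict etree.parse rejects.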