我想找到LI标签内的span标签及其属性。我尝试使用BeautifulSoup,但没有成功。下面是我的代码,有人能指出我的方法哪里有问题吗?
在这段代码中,我的getId函数应该返回id =“0_False-2”
任何人都知道正确的方法吗?
from BeautifulSoup import BeautifulSoup as bs
import re
# Sample markup: a nested <ul> tree whose <li> items wrap the <span> labels.
# Adjacent string literals are concatenated at compile time, so the value
# contains no newlines or indentation.
html = (
    '<ul>'
    '<li class="line"> </li>'
    '<li class="folder-open-last" id="0">'
    '<img style="float: left;" class="trigger" src="/media/images/spacer.gif" border="0">'
    '<span class="text" id="0_False">NOC</span>'
    '<ul style="display: block;">'
    '<li class="line"> </li>'
    '<li class="doc" id="1"><span class="active text" id="0_False-1">PNQAIPMS1</span></li>'
    '<li class="line"> </li>'
    '<li class="doc-last" id="2"><span class="text" id="0_False-2">PNQAIPMS2</span></li>'
    '<li class="line-last"></li>'
    '</ul>'
    '</li>'
    '<li class="line-last"></li>'
    '</ul>'
)
def getId(html, txt):
    """Return the ``id`` attribute of the ``<span>`` whose text equals *txt*.

    The markup is parsed with BeautifulSoup (imported at file level as
    ``bs``) and every ``<span>`` tag is inspected directly, so the id that
    comes back belongs to the span itself rather than to an enclosing
    ``<li>``.

    Args:
        html: Markup string to search.
        txt: Exact text content the wanted span must have.

    Returns:
        The span's ``id`` value, or ``None`` when no span has that text (or
        the matching span carries no ``id`` attribute).
    """
    soup = bs(html)
    for span in soup.findAll('span'):
        # ``span.string`` is None when the span holds anything other than a
        # single text node; None never equals txt, so such spans are skipped.
        if span.string == txt:
            return span.get('id')
    return None
# Look up the span id for a sample label. The result is kept in ``span_id``
# so the builtin ``id`` is not shadowed, and %-formatting keeps the report
# working even when getId() returns None.
span_id = getId(html, "PNQAIPMS2")
print("ID = %s" % span_id)
答案 0 :(得分:0)
我相信会有人告诉你用BeautifulSoup的做法,但下面是我的方法:只用普通的Python字符串操作。
# Sample markup: a nested <ul> tree whose <li> items wrap the <span> labels.
# Adjacent string literals are concatenated at compile time, so the value
# contains no newlines or indentation.
html = (
    '<ul>'
    '<li class="line"> </li>'
    '<li class="folder-open-last" id="0">'
    '<img style="float: left;" class="trigger" src="/media/images/spacer.gif" border="0">'
    '<span class="text" id="0_False">NOC</span>'
    '<ul style="display: block;">'
    '<li class="line"> </li>'
    '<li class="doc" id="1"><span class="active text" id="0_False-1">PNQAIPMS1</span></li>'
    '<li class="line"> </li>'
    '<li class="doc-last" id="2"><span class="text" id="0_False-2">PNQAIPMS2</span></li>'
    '<li class="line-last"></li>'
    '</ul>'
    '</li>'
    '<li class="line-last"></li>'
    '</ul>'
)
def getId(html, txt):
    """Return the ``id`` attribute of the ``<span>`` whose text equals *txt*.

    Only the text between ``<span ...>`` and ``</span>`` is compared with
    *txt* (surrounding whitespace ignored), so attribute values such as
    ``class="text"`` and partial labels can no longer produce a false match.
    The id is read from the span's own attribute list wherever it appears,
    with either quote style.

    Spans whose content contains nested tags are not considered.

    Args:
        html: Markup string to search.
        txt: Text content the wanted span must have.

    Returns:
        The span's ``id`` value, or ``None`` when no span with that text and
        an ``id`` attribute exists.
    """
    import re  # function-scope import keeps this snippet self-contained

    # Group 1: raw attribute text of the opening tag; group 2: span content.
    span_pattern = re.compile(r'<span\b([^>]*)>([^<]*)</span>', re.IGNORECASE)
    # The lookbehind rejects attributes that merely end in "id" (data-id, ...).
    id_pattern = re.compile(r'''(?<![\w-])id\s*=\s*(["'])(.*?)\1''',
                            re.IGNORECASE)

    wanted = txt.strip()
    for span in span_pattern.finditer(html):
        attributes, content = span.groups()
        if content.strip() != wanted:
            continue
        found = id_pattern.search(attributes)
        if found:
            return found.group(2)
    return None
# Report the span id for each sample label. A single %-formatted argument
# makes print() behave the same under Python 2 and Python 3, and the
# resulting lines match the sample output (one space after the colon).
for label in ("PNQAIPMS2", "NOC", "PNQAIPMS1"):
    print("id for %s: %s" % (label, getId(html, label)))
输出
$ ./python.py
id for PNQAIPMS2: 0_False-2
id for NOC: 0_False
id for PNQAIPMS1: 0_False-1