HTML代码
<a href="1.co">1<a href="2.co">2</a></a>
我尝试对第一个标签的“内容”(contents)递归地调用 BeautifulSoup,但 BeautifulSoup 报错:
if hasattr(markup, 'read'): # It's a file-type object.
> markup = markup.read()
E TypeError: 'NoneType' object is not callable
Python代码
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
def parse(text):
    """Print the href of each <a href> element parsed from *text*, then recurse.

    Only <a> tags are parsed (via SoupStrainer). Every item found in an
    element's ``contents`` is handed back to this function.

    NOTE(review): the snippet's indentation was lost in the paste; the nesting
    below is a reconstruction -- confirm against the original post.
    """
    anchors_only = SoupStrainer(['a'])
    soup = BeautifulSoup(text, parse_only=anchors_only, features="html.parser")
    for tag in soup:
        is_link = tag.name == "a" and tag.has_attr("href")
        if is_link:
            print(tag["href"])
        if not hasattr(tag, "contents"):
            continue
        for child in tag.contents:
            # The bs4 node itself (not markup text) is passed on here; this is
            # the call reported to fail with
            # "TypeError: 'NoneType' object is not callable".
            parse(child)
if __name__ == '__main__':
    # Sample markup: one <a> nested inside another.
    sample_markup = """<a href="2.co">2<a href="3.co">3</a></a>"""
    parse(sample_markup)
答案 0 :(得分:3)
只需使用 find_all('a') 即可:
from bs4 import BeautifulSoup
# Sample markup: one <a> nested inside another.
data = '''<a href="1.co">1<a href="2.co">2</a></a>'''
soup = BeautifulSoup(data, 'html.parser')

# href=True keeps only the <a> tags that carry an href attribute.
linked_anchors = soup.find_all('a', href=True)
for anchor in linked_anchors:
    print(anchor['href'])
答案 1 :(得分:1)
在递归调用之前先对子节点调用 str() 即可解决问题:tag.contents 中的元素是 bs4 节点对象,而 BeautifulSoup 的构造函数需要的是标记字符串(或类文件对象)。
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
def parse(text):
    """Print the href of each <a href> element parsed from *text*, then recurse.

    Only <a> tags are parsed (via SoupStrainer). Every item in an element's
    ``contents`` is converted back to a string and parsed again by this
    function.

    NOTE(review): the snippet's indentation was lost in the paste; the nesting
    below is a reconstruction -- confirm against the original post.
    """
    anchors_only = SoupStrainer(['a'])
    soup = BeautifulSoup(text, parse_only=anchors_only, features="html.parser")
    for tag in soup:
        is_link = tag.name == "a" and tag.has_attr("href")
        if is_link:
            print(tag["href"])
        if not hasattr(tag, "contents"):
            continue
        for child in tag.contents:
            # Convert the node back to markup text before re-parsing it.
            parse(str(child))  # This is where the bug was
if __name__ == '__main__':
    # Sample markup: one <a> nested inside another.
    sample_markup = """<a href="2.co">2<a href="3.co">3</a></a>"""
    parse(sample_markup)
答案 2 :(得分:1)
如果要获取所有 <a> 标签,请按照上面的建议使用 .find_all('a')。但是,如果您只想要嵌套的 <a> 标签,则可以沿用当前的做法,然后在每个标签中查找其内部的 <a> 子标签:
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
# Sample markup: one <a> nested inside another.
text = '''<a href="1.co">1<a href="2.co">2</a></a>'''
soup = BeautifulSoup(text, parse_only=SoupStrainer(['a']), features="html.parser")
for tag in soup:
    if tag.name == "a" and tag.has_attr("href"):
        # find_all() is the current name for the legacy findChildren() alias
        # and searches all descendants of the tag. href=True skips nested <a>
        # tags that have no href, which would otherwise raise KeyError on
        # child["href"] below.
        children = tag.find_all('a', href=True)
        for child in children:
            print(child["href"])