html="""<div class="practice-location">
<strong>Primary Location of Practice</strong><br/>
Suite 100<br/>2010 Eglinton Avenue West<br/>Toronto ON  M6E 2K3<br/><strong>
</div>"""
我提取地址时遇到问题。
我希望字符串看起来像
mystr=Suite 100,2010 Eglinton Avenue West, Toronto ON  M6E 2K3
我的代码:
# Asker's attempt: find the "practice-location" container, then try to gather
# the text that follows its <strong> tags and join the pieces with commas.
dt = soup.find(class_ ={"practice-location"})
print dt
# NOTE(review): find_all returns a list-like ResultSet, not a single element.
ele=dt.find_all('strong')
print ele
add=[]
# NOTE(review): find_next_siblings is a method of an individual element; calling
# it on the ResultSet above is presumably where this snippet fails -- confirm.
for x in ele.find_next_siblings(text=True):
add.append(x.text)
location=','.join(add)
print location
答案 0 :(得分:1)
使用.extract()
删除标签,并使用.replace_with
替换标签
from bs4 import BeautifulSoup

html="""<div class="practice-location">
<strong>Primary Location of Practice</strong><br/>
Suite 100<br/>2010 Eglinton Avenue West<br/>Toronto ON  M6E 2K3<br/><strong>
</div>"""

soup = BeautifulSoup(html, 'html.parser')
dt = soup.find(class_={"practice-location"})

# Detach the first <strong> (the heading) so its text is not part of the output.
heading = dt.strong
heading.extract()

# Swap every <br/> for a ", " separator so the address lines stay delimited.
for line_break in dt.find_all('br'):
    line_break.replace_with(', ')

# Strip in three passes: outer whitespace/newlines, then the leftover outer
# commas, then the whitespace that was sitting behind those commas.
raw_text = dt.text
print(raw_text.strip().strip(',').strip())
# Suite 100, 2010 Eglinton Avenue West, Toronto ON  M6E 2K3
关于这三次strip()
,将<br>
替换为,
后,它将产生字符串
,
Suite 100, 2010 Eglinton Avenue West, Toronto ON  M6E 2K3,
第一个.strip()
删除首尾的空格和换行符,第二个删除首尾的逗号,第三个再次删除剩余的空格和换行符。
答案 1 :(得分:0)
您可以直接使用.text或.extract,但我想您希望各部分用','分隔。
下面的函数可以做到这一点。
from bs4 import BeautifulSoup, Tag
def split_at_br(text):
    """Flatten a sequence of parsed HTML nodes into one comma-separated string.

    Args:
        text: Iterable of nodes, e.g. the ``.contents`` of a BeautifulSoup
            element. Each item is either a string (``NavigableString`` is a
            ``str`` subclass) or a ``Tag``.

    Returns:
        The concatenated text, with every ``<br>`` tag rendered as ``','``,
        other tags rendered as their text, and surrounding whitespace plus any
        trailing separator commas removed.
    """
    parts = []
    for node in text:
        if isinstance(node, str):
            if '\n' in node:
                # Multi-line text: trim each line and re-join with single
                # spaces so stray newlines/indentation do not leak through.
                node = ' '.join(line.strip() for line in node.split('\n')).strip()
            parts.append(node)
        elif isinstance(node, Tag):
            # A <br> marks a field boundary; any other tag contributes its text.
            parts.append(',' if node.name == 'br' else node.text)

    # Remove dangling separators/whitespace instead of blindly slicing off the
    # last two characters, which truncated real data whenever the input did not
    # happen to end in ",\n".
    return ''.join(parts).strip().rstrip(', \t\r\n')
给出输出:
html="""<div class="practice-location">
<strong>Primary Location of Practice</strong><br/>
Suite 100<br/>2010 Eglinton Avenue West<br/>Toronto ON  M6E 2K3<br/><strong>
</div>"""

soup = BeautifulSoup(html, 'html.parser')

# select() returns a list of matches; the first one is the address container.
matches = soup.select('div.practice-location')
# Hand the container's direct children (strings and tags) to split_at_br.
text = matches[0].contents
mystr = split_at_br(text)
然后
In [1]: print (mystr)
Primary Location of Practice,Suite 100,2010 Eglinton Avenue West,Toronto ON  M6E 2K3