我的问题与“Strip HTML from strings in Python”这个问题略有关系。
我正在寻找一种从文本中删除HTML代码的简单方法。例如:
# Desired usage: stripIt() should drop the element together with its content.
string = 'foo <SOME_VALID_HTML_TAG> something </SOME_VALID_HTML_TAG> bar'
stripIt(string)
然后会得到 foo bar。
有没有简单的工具可以在Python中实现这一点? HTML代码可以嵌套。
答案 0 :(得分:6)
import lxml.html
import re
def stripIt(s):
    """Return the top-level text of the HTML fragment ``s``.

    The fragment is parsed with ``lxml.html.fromstring`` and only the text
    nodes that are direct children of the parsed root are kept (XPath
    ``text()``), so every element and the text nested inside it is dropped.
    Runs of whitespace in the result are collapsed to a single space.

    :param s: HTML fragment as a string.
    :returns: the remaining text with normalized whitespace.
    """
    doc = lxml.html.fromstring(s)  # parse html string
    txt = doc.xpath('text()')  # ['foo ', ' bar']
    txt = ' '.join(txt)  # 'foo   bar'
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', txt)  # 'foo bar'
# Example call: only the text outside the element is expected to remain.
s = 'foo <SOME_VALID_HTML_TAG> something </SOME_VALID_HTML_TAG> bar'
stripIt(s)
返回
foo bar
答案 1 :(得分:5)
from BeautifulSoup import BeautifulSoup
def removeTags(html, *tags):
    """Parse ``html`` and blank out every element named in ``tags``.

    Each matching element (including its content) is replaced in the parse
    tree by an empty string.

    :param html: markup to parse with BeautifulSoup.
    :param tags: tag names whose elements should be removed.
    :returns: the modified BeautifulSoup tree (not a plain string).
    """
    tree = BeautifulSoup(html)
    for name in tags:
        matches = tree.findAll(name)
        for element in matches:
            element.replaceWith("")
    return tree
# Sample document used to demonstrate removeTags().
testhtml = '''
<html>
<head>
<title>Page title</title>
</head>
<body>text here<p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
<p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
</body>
</html>'''
# Parenthesized single-argument print behaves the same on Python 2 and 3.
print(removeTags(testhtml, 'b', 'p'))
答案 2 :(得分:3)
您可以使用正则表达式:
def stripIt(s):
    """Remove ``<tag>...</tag>`` spans from ``s`` and collapse whitespace.

    Everything from an opening tag up to the nearest following closing tag
    is deleted, including the text in between, even when that text spans
    several lines. Whitespace runs in the result become a single space.

    Limitations of the regex approach: the closing tag is not matched
    against the opening tag's name, so nested elements leave a stray
    closing tag behind, and a tag with no closing tag after it is left
    untouched.

    :param s: text that may contain HTML elements.
    :returns: the text with those elements removed.
    """
    # (?s) lets '.' cross newlines so multi-line element content is removed.
    txt = re.sub(r'(?s)<[^<]+?>.*?</[^<]+?>', '', s)  # Remove html tags
    return re.sub(r'\s+', ' ', txt)  # Normalize whitespace
然而,我更喜欢Hugh Bothwell的解决方案,因为它比纯正则表达式更强大。
答案 3 :(得分:2)
试试这个解决方案:
from BeautifulSoup import BeautifulSoup
def stripIt(string, tag):
    """Delete every ``tag`` element found by BeautifulSoup from ``string``.

    The markup is parsed once; the string form of each matching element is
    then removed from the text by plain substring replacement.

    :param string: markup to clean.
    :param tag: name of the tag whose elements are removed.
    :returns: ``string`` with the serialized matches replaced by ``''``.
    """
    cleaned = string
    for element in BeautifulSoup(string).findAll(tag):
        cleaned = cleaned.replace(str(element), '')
    return cleaned
string = 'foo <p> something </p> bar'
print stripIt(string, 'p')
>>> foo bar
string = 'foo <a>bar</a> baz <a>quux</a>'
print stripIt(string, 'a')
>>> foo baz
编辑:这仅适用于有效嵌套的标记,例如:
string = 'blaz <div>baz <div>quux</div></div>'
print stripIt(string, 'div')
>>> blaz
string = 'blaz <a>baz <a>quux</a></a>'
print stripIt(string, 'a')
>>> blaz <a>baz </a>
答案 4 :(得分:2)
如果有人遇到此问题且已在使用jinja模板语言:您可以在模板中使用过滤器striptags,并在代码中使用函数jinja2.filters.do_striptags()。
答案 5 :(得分:0)
您可以通过相应地覆盖方法来利用HTMLParser:
try:  # Python 3
    from html.entities import name2codepoint
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from htmlentitydefs import name2codepoint
    from HTMLParser import HTMLParser
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3


class HTMLStripper(HTMLParser):
    """HTMLParser subclass that collects only the text outside any element.

    ``depth`` counts the currently open elements; character data is kept
    only while ``depth`` is zero. Each kept chunk is stripped of surrounding
    whitespace and the chunks are joined with single spaces by
    ``get_stripped_text``.
    """

    # Elements that never get a closing tag. Counting them as "open" would
    # leave depth above zero and discard all text that follows them.
    _VOID_ELEMENTS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def reset(self):
        """Reset parser state; HTMLParser calls this on instantiation."""
        HTMLParser.reset(self)
        # Per-instance state: a class-level list would be shared by all
        # parsers and leak text from one document into the next.
        self.text_parts = []
        self.depth = 0

    def handle_data(self, data):
        """Keep ``data`` if no element is open and it is not just whitespace."""
        if self.depth == 0:
            chunk = data.strip()
            if chunk:  # empty chunks would produce doubled spaces on join
                self.text_parts.append(chunk)

    def handle_charref(self, ref):
        """Handle a numeric reference; ``ref`` is e.g. ``'62'`` or ``'x3E'``."""
        try:
            if ref[:1] in ('x', 'X'):
                codepoint = int(ref[1:], 16)
            else:
                codepoint = int(ref)
            data = _unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: skip it, the same way
            # unknown named entities are skipped.
            return
        self.handle_data(data)

    def handle_starttag(self, tag, attrs):
        """Enter an element (void elements do not change the depth)."""
        if tag not in self._VOID_ELEMENTS:
            self.depth += 1

    def handle_endtag(self, tag):
        """Leave an element; stray closing tags never push depth below 0."""
        if tag not in self._VOID_ELEMENTS and self.depth > 0:
            self.depth -= 1

    def handle_entityref(self, ref):
        """Handle a named entity such as ``amp``; unknown names are ignored."""
        try:
            data = _unichr(name2codepoint[ref])
        except KeyError:
            return
        self.handle_data(data)

    def get_stripped_text(self):
        """Return the collected text chunks joined by single spaces."""
        return ' '.join(self.text_parts)
def strip_html_from_text(html):
    """Return the text of ``html`` that lies outside any HTML element.

    :param html: markup to process.
    :returns: the text collected by a fresh ``HTMLStripper``.
    """
    parser = HTMLStripper()
    parser.feed(html)
    # feed() may hold back trailing input it cannot yet classify (for
    # example text ending in an unfinished '&' reference); close() forces
    # it through so no text is lost.
    parser.close()
    return parser.get_stripped_text()
def main():
    """Read HTML from standard input and print its stripped text."""
    import sys
    html = sys.stdin.read()
    text = strip_html_from_text(html)
    # Parenthesized single-argument print works on Python 2 and Python 3.
    print(text)


if __name__ == '__main__':
    main()