我有这段代码:
# Sample markup: a <script> and an <i> element (both absent from allow_tags
# below) next to an allowed <b> element.
evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"
# Only <p>, <br> and <b> are whitelisted via allow_tags.
cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'],
                  page_structure=True)
# Call form of print: valid on Python 3 and prints the same single value on
# Python 2. As reported below, the printed result is wrapped in a <div>.
print(cleaner.clean_html(evil))
我希望得到这个:
<b>bold text</b>italic text
但相反,我得到了这个:
<div><b>bold text</b>italic text</div>
是否有某个属性(选项)可以去掉外层包装的div标记?
答案 0 :(得分:13)
lxml期望你的html有一个树形结构,即一个根节点。如果它没有,则添加它。
答案 1 :(得分:0)
Cleaner始终将结果包装在一个元素中。一个好的解决方案是手动解析HTML,并将得到的文档对象传给Cleaner;这样返回的结果同样是文档对象,您可以使用text_content从根节点提取文本。
from lxml.html import document_fromstring
from lxml.html.clean import Cleaner

# The literal has to stay on one line: a plain double-quoted string cannot
# span a line break.
evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"
# Parse the markup ourselves and hand the resulting document object to the
# Cleaner instead of a raw string.
doc = document_fromstring(evil)
cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'],
                  page_structure=True)
# text_content() is called on the cleaned document to pull out its text.
# Call form of print works on both Python 2 and Python 3.
print(cleaner.clean_html(doc).text_content())
这也可以用一行代码(one-liner)完成。
答案 2 :(得分:0)
这是我想出的。
import lxml
from lxml.html.clean import Cleaner
def clean_html(html):
    """Clean *html* with a default ``Cleaner`` and return the result.

    If the input was not already wrapped in a root ``<div>`` (as judged by
    ``check_is_wrap_in_div``), the cleaned markup is passed through
    ``remove_root_div`` before being returned. Falsy input (``None``,
    ``""``) is returned unchanged without invoking the cleaner.
    """
    if not html:
        return html

    # Decide this on the raw input, before the cleaner has touched it.
    had_root_div = check_is_wrap_in_div(html)
    cleaned = Cleaner().clean_html(html)
    return cleaned if had_root_div else remove_root_div(cleaned)
def check_is_wrap_in_div(html):
    """Return True when *html* parses as XML whose root element is ``<div>``.

    Input that ``lxml.etree.fromstring`` rejects with ``XMLSyntaxError``
    (for example several sibling top-level elements) yields False.
    """
    try:
        root = lxml.etree.fromstring(html)
    except lxml.etree.XMLSyntaxError:
        # Not parseable as a single-rooted XML document.
        return False
    return root.tag == 'div'
def remove_root_div(html):
    """Strip a leading root ``<div ...>`` and its final ``</div>`` from *html*.

    The string is returned unchanged when it does not start (after optional
    whitespace) with a ``<div`` opening tag or contains no ``</div>``.

    Args:
        html: Markup string to unwrap.

    Returns:
        The markup between the first opening ``<div ...>`` tag and the last
        ``</div>``; surrounding whitespace of the inner content is kept.
    """
    # Imported here because the surrounding snippet never imports ``re``;
    # without it the call below fails with NameError.
    import re

    # Group 1: optional whitespace plus the opening tag (lazy up to the first
    # '>'). Group 2: greedy, so it extends to the LAST '</div>'. Group 3: the
    # closing tag plus anything trailing it.
    root_div_regex = r'^(\s*<div[\s\S]*?>)([\s\S]*)(<\/div>[\s\S]*?)$'
    return re.sub(root_div_regex, r'\2', html)
# Example usage. `evil_html` is not defined in this snippet; it stands for
# the markup string the caller wants cleaned.
cleaned_html = clean_html(evil_html)
并进行单元测试
# Unit tests for check_is_wrap_in_div and remove_root_div, accessed through
# a `utils` name (presumably the module holding those helpers - the import is
# not part of this snippet, nor is `import unittest`).
class TestBase(unittest.TestCase):
# check_is_wrap_in_div: inputs with a single root <div> must give True,
# everything else must give False.
def test_check_is_wrap_in_div(self):
# Positive cases: one root <div>, with varying attributes, line breaks
# inside the opening tag, and nested <div> children.
with self.subTest('test html wrap in div'):
self.assertTrue(
utils.check_is_wrap_in_div('<div></div>'),
)
self.assertTrue(
utils.check_is_wrap_in_div("""
<div>
<p>Hello</p>
<p>Test</p>
</div>
""")
)
self.assertTrue(
utils.check_is_wrap_in_div("""
<div class="test" style="color: blue;">
<p>Hello</p>
<p>Test</p>
</div>
""")
)
self.assertTrue(
utils.check_is_wrap_in_div("""
<div
class="test"
style="color: blue;"
>
<p>Hello</p>
<p>Test</p>
</div>
""")
)
self.assertTrue(
utils.check_is_wrap_in_div("""
<div
class="test"
style="color: blue;">
<p>Hello</p>
<p>Test</p>
</div>
""")
)
self.assertTrue(
utils.check_is_wrap_in_div("""
<div
class="test"
style="color: blue;">
<div>
<div>
<p>Hello</p>
<p>Test</p>
</div>
</div>
<div>
<p>Hi</p>
</div>
</div>
""")
)
self.assertTrue(
utils.check_is_wrap_in_div("""
<div
class="test"
style="color: blue;">
<div>
<p>Hello</p>
<p>Test</p>
</div>
</div>
""")
)
# Negative cases: the root is another element (<body>, <section>), or
# there are several top-level siblings, so there is no single root <div>.
with self.subTest('test html not wrap in div'):
html_list = [
"""
<body>
<div
class="test"
style="color: blue;">
<p>hello</p>
</div>
</body>
""",
"""
<p>HELLO</p>
<p>TEST</p>
""",
"""
<section>
<div>
<p>hello</p>
</div>
</section>
""",
'<p>HELLO</p><p>TEST</p>',
"""
<body>
<div class="HELO">
<p>hello</p>
</div>
</body>
""",
"""
<body>
<div
class="test"
style="color: blue;"
>
<p>hello</p>
</div>
</body>
""",
"""
<body>
<div
class="test"
style="color: blue;">
<p>hello</p>
</div>
</body>
""",
"""
<p>Hello</p>
<p>World</p>
<div class="testing">
Hello
</div>
""",
"""
<div>
<p>Hello</p>
<p>World</p>
<p>Hello</p>
<p>World</p>
</div>
<div> </div>
""",
# NOTE(review): as shown here this fixture is identical to the previous
# one; they may have differed only in whitespace originally - confirm.
"""
<div>
<p>Hello</p>
<p>World</p>
<p>Hello</p>
<p>World</p>
</div>
<div> </div>
""",
"""
<div>
<div>
<p>Hello</p>
<p>World</p>
<p>Hello</p>
<p>World</p>
</div>
</div>
<span>
<div> </div>
</span>
""",
]
# Every fixture above must be reported as not wrapped in a root <div>.
for html in html_list:
self.assertFalse(
utils.check_is_wrap_in_div(html),
)
# remove_root_div: a leading root <div> is stripped, any other input is
# returned unchanged.
def test_remove_root_div(self):
# Inputs that start with a root <div>; apart from the first case, result
# and expectation are compared after .strip() so only the inner markup
# matters.
with self.subTest('test remove root html'):
self.assertEqual(
utils.remove_root_div('<div></div>'),
'',
)
self.assertEqual(
utils.remove_root_div(
"""
<div>
<p>Hello</p>
<p>Test</p>
</div>
"""
).strip(),
"""
<p>Hello</p>
<p>Test</p>
""".strip(),
)
self.assertEqual(
utils.remove_root_div(
"""
<div class="test" style="color: blue;">
<p>Hello</p>
<p>Test</p>
</div>
"""
).strip(),
"""
<p>Hello</p>
<p>Test</p>
""".strip(),
)
self.assertEqual(
utils.remove_root_div(
"""
<div
class="test"
style="color: blue;"
>
<p>Hello</p>
<p>Test</p>
</div>
"""
).strip(),
"""
<p>Hello</p>
<p>Test</p>
""".strip(),
)
self.assertEqual(
utils.remove_root_div(
"""
<div
class="test"
style="color: blue;">
<p>Hello</p>
<p>Test</p>
</div>
"""
).strip(),
"""
<p>Hello</p>
<p>Test</p>
""".strip(),
)
# Nested case: only the outer <div> is removed, the inner one stays.
self.assertEqual(
utils.remove_root_div(
"""
<div
class="test"
style="color: blue;">
<div>
<p>Hello</p>
<p>Test</p>
</div>
</div>
"""
).strip(),
"""
<div>
<p>Hello</p>
<p>Test</p>
</div>
""".strip(),
)
# Same nested case, but the input starts directly with "<div" (no leading
# newline before the opening tag).
self.assertEqual(
utils.remove_root_div(
"""<div
class="test"
style="color: blue;">
<div>
<p>Hello</p>
<p>Test</p>
</div>
</div>
"""
).strip(),
"""
<div>
<p>Hello</p>
<p>Test</p>
</div>
""".strip(),
)
# Inputs that do not start with a <div>: the function must return them
# exactly as given.
with self.subTest('test not root html'):
html_list = [
"""
<body>
<div
class="test"
style="color: blue;">
<p>hello</p>
</div>
</body>
""",
"""
<p>HELLO</p>
<p>TEST</p>
""",
"""
<section>
<div>
<p>hello</p>
</div>
</section>
""",
'<p>HELLO</p><p>TEST</p>',
"""
<body>
<div class="HELO">
<p>hello</p>
</div>
</body>
""",
"""
<body>
<div
class="test"
style="color: blue;"
>
<p>hello</p>
</div>
</body>
""",
"""
<body>
<div
class="test"
style="color: blue;">
<p>hello</p>
</div>
</body>
""",
]
# Output must equal the input for every fixture above.
for html in html_list:
self.assertEqual(
utils.remove_root_div(html),
html,
)