如何使用 lxml.html 的 Cleaner,而不让返回结果被包裹在 div 标签中?

时间:2014-01-29 02:28:28

标签: python lxml.html

我有这段代码:

# Test input: a fragment with no single root element (a <script> followed by
# two inline elements).
evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"
# Keep only <p>, <br> and <b> tags (allow_tags); unknown-tag removal is
# switched off and page_structure cleaning is enabled.
cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'],
                  page_structure=True)
# Python 2 print statement: writes the cleaned markup to stdout.
print cleaner.clean_html(evil)

我希望得到这个:

<b>bold text</b>italic text

但相反,我得到了这个:

<div><b>bold text</b>italic text</div>

是否有某个选项可以去掉外层包裹的 div 标签?

3 个答案:

答案 0 :(得分:13)

lxml 期望你的 HTML 具有树形结构,即只有一个根节点。如果没有根节点,lxml 会自动添加一个。

答案 1 :(得分:0)

Cleaner 总是会把结果包裹在一个元素里。一个可行的办法是先手动解析 HTML,把得到的文档对象传给 Cleaner;这样返回的结果同样是文档对象,然后就可以在根元素上调用 text_content() 来提取文本。

from lxml.html import document_fromstring
from lxml.html.clean import Cleaner
# Same test fragment as in the question. The literal is kept on one line:
# a plain double-quoted string cannot span a line break.
evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"
# Parse the markup into a document object first, so the Cleaner is handed a
# parsed tree rather than a raw string.
doc = document_fromstring(evil)
cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'],
                  page_structure=True)
# Clean the parsed document and print only its text.
# NOTE(review): text_content() presumably drops the remaining <b> markup as
# well, not just the wrapper element -- confirm that is acceptable.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(cleaner.clean_html(doc).text_content())

这也可以写成一行代码(one-liner)完成。

答案 2 :(得分:0)

这是我想出的。

import re

import lxml
from lxml.html.clean import Cleaner

def clean_html(html):
    """Run *html* through a default ``Cleaner`` without adding a root div.

    Falsy input (``None``, ``''``) is returned untouched. Otherwise the
    markup is cleaned, and the outer ``<div>`` is stripped from the result
    unless the input itself already had a ``<div>`` as its root element.
    """
    if not html:
        return html

    # Must be decided on the raw input, before cleaning changes it.
    had_root_div = check_is_wrap_in_div(html)
    cleaned = Cleaner().clean_html(html)

    if had_root_div:
        return cleaned
    return remove_root_div(cleaned)


def check_is_wrap_in_div(html):
    """Return True when *html* parses as XML with a ``div`` root element.

    Markup that ``lxml.etree.fromstring`` rejects with ``XMLSyntaxError``
    is reported as not wrapped (False).

    NOTE(review): this uses the XML parser, so HTML that is not well-formed
    XML presumably also ends up as False -- confirm this is intended.
    """
    try:
        root = lxml.etree.fromstring(html)
    except lxml.etree.XMLSyntaxError:
        return False
    return root.tag == 'div'


def remove_root_div(html):
    """Strip one outer ``<div ...>`` ... ``</div>`` pair from *html*.

    The opening ``<div`` tag must be the first non-whitespace content and the
    last ``</div>`` may be followed only by whitespace. In that case the text
    between the two tags is returned; in every other case *html* is returned
    unchanged.

    This is a regex heuristic, not a parser: it does not check that the
    first opening tag and the last closing tag belong to the same element.
    """
    # Group 1: leading whitespace plus the opening <div ...> tag.
    # Group 2: everything up to the last </div> (greedy), which is kept.
    # Group 3: the last </div> plus trailing whitespace only. Allowing
    # arbitrary trailing text here would silently discard that text.
    root_div_regex = r'^(\s*<div[\s\S]*?>)([\s\S]*)(<\/div>\s*?)$'
    return re.sub(root_div_regex, r'\2', html)

# Example usage: evil_html stands for the caller's input markup string
# (it is not defined in this snippet).
cleaned_html = clean_html(evil_html)

以及对应的单元测试:

class TestBase(unittest.TestCase):
    # Unit tests for utils.check_is_wrap_in_div and utils.remove_root_div.
    # The fixtures are whitespace-sensitive string literals; keep them as-is.

    # check_is_wrap_in_div must be truthy when the only top-level element is
    # a <div>, and falsy for any other root or for several top-level elements.
    def test_check_is_wrap_in_div(self):
        with self.subTest('test html wrap in div'):
            # Empty div.
            self.assertTrue(
                utils.check_is_wrap_in_div('<div></div>'),
            )
            # Plain div surrounded by whitespace.
            self.assertTrue(
                utils.check_is_wrap_in_div("""
                    <div>
                        <p>Hello</p>
                        <p>Test</p>
                    </div>
                """)
            )
            # Div with attributes on one line.
            self.assertTrue(
                utils.check_is_wrap_in_div("""
                    <div class="test" style="color: blue;">
                        <p>Hello</p>
                        <p>Test</p>
                    </div>
                """)
            )
            # Attributes spread over several lines, '>' on its own line.
            self.assertTrue(
                utils.check_is_wrap_in_div("""
                    <div
                        class="test"
                        style="color: blue;"
                    >
                        <p>Hello</p>
                        <p>Test</p>
                    </div>
                """)
            )
            # Attributes spread over several lines, '>' after the last one.
            self.assertTrue(
                utils.check_is_wrap_in_div("""
                    <div
                        class="test"
                        style="color: blue;">
                        <p>Hello</p>
                        <p>Test</p>
                    </div>
                """)
            )
            # Root div containing nested and sibling divs.
            self.assertTrue(
                utils.check_is_wrap_in_div("""
                    <div
                        class="test"
                        style="color: blue;">
                        <div>
                            <div>
                                <p>Hello</p>
                                <p>Test</p>
                            </div>
                        </div>
                        <div>
                            <p>Hi</p>
                        </div>
                    </div>
                """)
            )
            # Root div containing a single nested div.
            self.assertTrue(
                utils.check_is_wrap_in_div("""
                    <div
                        class="test"
                        style="color: blue;">
                        <div>
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                    </div>
                """)
            )

        with self.subTest('test html not wrap in div'):
            # Inputs whose root is not a div, or that have more than one
            # top-level element.
            html_list = [
                """
                    <body>
                        <div
                            class="test"
                            style="color: blue;">
                            <p>hello</p>
                        </div>
                    </body>
                """,
                """
                    <p>HELLO</p>
                    <p>TEST</p>
                """,
                """
                    <section>
                        <div>
                            <p>hello</p>
                        </div>
                    </section>
                """,
                '<p>HELLO</p><p>TEST</p>',
                """
                    <body>
                        <div class="HELO">
                            <p>hello</p>
                        </div>
                    </body>
                """,
                """
                    <body>
                        <div
                            class="test"
                            style="color: blue;"
                        >
                            <p>hello</p>
                        </div>
                    </body>
                """,
                """
                    <body>
                        <div
                            class="test"
                            style="color: blue;">
                            <p>hello</p>
                        </div>
                    </body>
                """,
                """
                    <p>Hello</p>
                    <p>World</p>
                    <div class="testing">
                        Hello
                    </div>
                """,
                """
                    <div>
                        <p>Hello</p>
                        <p>World</p>
                        <p>Hello</p>
                        <p>World</p>
                    </div>
                    <div> </div>
                """,
                # NOTE(review): identical to the previous fixture -- looks
                # like an accidental duplicate.
                """
                    <div>
                        <p>Hello</p>
                        <p>World</p>
                        <p>Hello</p>
                        <p>World</p>
                    </div>
                    <div> </div>
                """,
                """
                    <div>
                        <div>
                            <p>Hello</p>
                            <p>World</p>
                            <p>Hello</p>
                            <p>World</p>
                        </div>
                    </div>
                    <span>
                        <div> </div>
                    </span>
                """,
            ]
            for html in html_list:
                self.assertFalse(
                    utils.check_is_wrap_in_div(html),
                )

    # remove_root_div must return the inner markup of a root <div>, and must
    # return the input unchanged when it does not start with a <div>.
    def test_remove_root_div(self):
        with self.subTest('test remove root html'):
            # Empty div collapses to the empty string.
            self.assertEqual(
                utils.remove_root_div('<div></div>'),
                '',
            )
            # Results are compared after .strip(), so only the whitespace
            # between the inner elements has to match exactly.
            self.assertEqual(
                utils.remove_root_div(
                    """
                        <div>
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                    """
                ).strip(),
                """
                            <p>Hello</p>
                            <p>Test</p>
                """.strip(),
            )
            # Root div with attributes on one line.
            self.assertEqual(
                utils.remove_root_div(
                    """
                        <div class="test" style="color: blue;">
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                    """
                ).strip(),
                """
                            <p>Hello</p>
                            <p>Test</p>
                """.strip(),
            )
            # Multi-line opening tag, '>' on its own line.
            self.assertEqual(
                utils.remove_root_div(
                    """
                        <div
                            class="test"
                            style="color: blue;"
                        >
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                    """
                ).strip(),
                """
                            <p>Hello</p>
                            <p>Test</p>
                """.strip(),
            )
            # Multi-line opening tag, '>' after the last attribute.
            self.assertEqual(
                utils.remove_root_div(
                    """
                        <div
                            class="test"
                            style="color: blue;">
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                    """
                ).strip(),
                """
                            <p>Hello</p>
                            <p>Test</p>
                """.strip(),
            )
            # Only the outermost div is removed; the nested one is kept.
            self.assertEqual(
                utils.remove_root_div(
                    """
                        <div
                            class="test"
                            style="color: blue;">
                            <div>
                                <p>Hello</p>
                                <p>Test</p>
                            </div>
                        </div>
                    """
                ).strip(),
                """
                            <div>
                                <p>Hello</p>
                                <p>Test</p>
                            </div>
                """.strip(),
            )
            # Same as above, but the input starts with '<div' directly, with
            # no leading whitespace.
            self.assertEqual(
                utils.remove_root_div(
                    """<div
                        class="test"
                        style="color: blue;">
                            <div>
                                <p>Hello</p>
                                <p>Test</p>
                            </div>
                        </div>
                    """
                ).strip(),
                """
                            <div>
                                <p>Hello</p>
                                <p>Test</p>
                            </div>
                """.strip(),
            )

        with self.subTest('test not root html'):
            # Inputs that do not start with a <div>; each must come back
            # unchanged.
            html_list = [
                """
                    <body>
                        <div
                            class="test"
                            style="color: blue;">
                            <p>hello</p>
                        </div>
                    </body>
                """,
                """
                    <p>HELLO</p>
                    <p>TEST</p>
                """,
                """
                    <section>
                        <div>
                            <p>hello</p>
                        </div>
                    </section>
                """,
                '<p>HELLO</p><p>TEST</p>',
                """
                    <body>
                        <div class="HELO">
                            <p>hello</p>
                        </div>
                    </body>
                """,
                """
                    <body>
                        <div
                            class="test"
                            style="color: blue;"
                        >
                            <p>hello</p>
                        </div>
                    </body>
                """,
                """
                    <body>
                        <div
                            class="test"
                            style="color: blue;">
                            <p>hello</p>
                        </div>
                    </body>
                """,
            ]
            for html in html_list:
                self.assertEqual(
                    utils.remove_root_div(html),
                    html,
                )