我想知道是否有办法让 prettify 不在特定标签处另起新行。
我希望 span 和 a 标签不会被拆分到多行,例如:
from bs4 import BeautifulSoup as BS

# Sample markup: inline <span>/<a> elements nested inside block-level <div>s.
doc = """<div><div><span>a</span><span>b</span>
<a>link</a></div><a>link1</a><a>link2</a></div>"""

# Name the parser explicitly. Without it bs4 picks whichever parser happens
# to be installed, and lxml/html5lib would wrap this fragment in
# <html><body>, so the printed result would differ between machines.
soup = BS(doc, "html.parser")

# The parenthesised form is valid both as a Python 2 print statement and as
# the Python 3 print function.
print(soup.prettify())
下面是我要打印的内容:
<div>
<div>
<span>a</span><span>b</span>
<a>link</a>
</div>
<a>link1</a><a>link2</a>
</div>
但这是实际打印的内容:
<div>
<div>
<span>
a
</span>
<span>
b
</span>
<a>
link
</a>
</div>
<a>
link1
</a>
<a>
link2
</a>
</div>
把内联(inline)元素的标签放到新的一行,实际上会在它们之间引入空白,从而略微改变页面的实际显示效果。下面两个 jsfiddle 展示了这种差异:
如果您想知道为什么这一点对 BeautifulSoup 很重要,那是因为我正在编写一个网页调试器,prettify 功能(以及 bs4 中的其他功能)会非常有用。但如果我对文档进行美化,就有可能改变页面的实际呈现。
那么,有没有办法自定义 prettify 函数,让它不拆分某些标签?
答案 0 :(得分:12)
在找到更好的解决方案之前,我先贴出一个临时的 hack(变通做法)。
我实际上在自己的项目中用它来避免破坏 textarea 和 pre 标签。将 ['span','a'] 替换为您不希望被缩进的标签即可。
from bs4 import BeautifulSoup

# Workaround: swap every tag that must stay compact for a str.format()
# placeholder, prettify the remaining tree, then substitute the original
# (un-prettified) tag text back in.
markup = """<div><div><span>a</span><span>b</span>
<a>link</a></div><a>link1</a><a>link2</a></div>"""

# Double curly brackets to avoid problems with .format()
stripped_markup = markup.replace('{', '{{').replace('}', '}}')

# Explicit parser: keeps the fragment free of parser-added <html>/<body>
# wrappers and makes the result independent of which parsers are installed.
stripped_markup = BeautifulSoup(stripped_markup, 'html.parser')

unformatted_tag_list = []

for i, tag in enumerate(stripped_markup.find_all(['span', 'a'])):
    # tag.decode() always yields text (str(tag) is encoded bytes on
    # Python 2, which breaks .format() on non-ASCII content).
    # The captured text is inserted below as a format *value*, so the brace
    # doubling applied above is never undone for it by .format(); undo it
    # here, otherwise braces inside protected tags come out doubled.
    raw_tag = tag.decode().replace('{{', '{').replace('}}', '}')
    unformatted_tag_list.append(raw_tag)
    tag.replace_with('{' + 'unformatted_tag_list[{0}]'.format(i) + '}')

# .format() fills the placeholders and collapses the doubled braces that
# remain in the prettified (non-protected) part of the document.
pretty_markup = stripped_markup.prettify().format(unformatted_tag_list=unformatted_tag_list)

print(pretty_markup)
答案 1 :(得分:4)
简短的回答是:不行。
较长的回答是:不容易做到。
我还在使用 bs3,所以这是一个针对 bs3 的 hack(变通做法)。我正在将其移植到 bs4。
它主要是通过继承 Tag 和 BeautifulSoup 并重写 prettify(及相关)方法来实现的。
代码:
import sys
import BeautifulSoup
class Tag(BeautifulSoup.Tag):
    """Subclass of ``BeautifulSoup.Tag`` whose rendering methods accept a
    list of tag names (``pprint_exs``) that are kept "flat" when
    pretty-printing: for those tags no newline is emitted after the opening
    tag and no indentation/newline is added around their text content.
    """

    def __str__(self, encoding=BeautifulSoup.DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0, pprint_exs=[]):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string.

        ``pprint_exs`` is the list of tag names excluded from
        pretty-printing; it is only read here (membership tests) and
        passed on to ``renderContents``, never modified."""

        encodedName = self.toEncoding(self.name, encoding)

        # True when this tag is NOT in the exclusion list, i.e. it gets the
        # usual one-element-per-line treatment.
        unflatten_here = (not self.name in pprint_exs)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            # NOTE(review): "&squot;" does not look like a
                            # standard XML/HTML entity name (the TODO
                            # above points at "apos") - confirm before
                            # relying on this output.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        # Look up the preceding Tag and the preceding sibling Tag.
        # Presumably these BeautifulSoup 3 helpers return the closest match
        # or None - the None case is handled by the conditionals below.
        prev = self.findPrevious(lambda x: isinstance(x, Tag))
        prev_sib = self.findPreviousSibling(lambda x: isinstance(x, Tag))

        # True when the previous sibling tag is an excluded (flat) tag with
        # a different name than this one.
        ex_break_detected = (self.name != prev_sib.name) if(prev_sib and prev_sib.name in pprint_exs) else False
        # True when the preceding tag has a different name than this one.
        break_detected = (self.name != prev.name) if(prev) else False

        indentTag, indentContents = 0, 0
        if prettyPrint:
            # Children are indented one level deeper only when this tag
            # starts a new run or is pretty-printed normally.
            if(break_detected or unflatten_here):
                indentContents = indentLevel + 1
            indentTag = indentLevel
            # For indentTag == 0 the multiplier is negative, which yields
            # an empty string.
            space = (' ' * (indentTag-1))

        contents = self.renderContents(encoding, prettyPrint, indentContents, pprint_exs, unflatten_here)
        if self.hidden:
            # Hidden tags contribute only their rendered contents.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            # A flat tag that follows a differently named flat sibling
            # starts on a new line.
            if prettyPrint and ex_break_detected and not unflatten_here:
                s.append("\n")
            # Indent the opening tag for normal tags and for the first tag
            # of a run of flat tags.
            if prettyPrint and (unflatten_here or break_detected):
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            # Only normally printed tags get a line break after the
            # opening tag.
            if prettyPrint and unflatten_here:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n" and unflatten_here:
                s.append("\n")
            if prettyPrint and closeTag and unflatten_here:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling and unflatten_here:
                s.append("\n")
            # A flat tag ends its line only when the next sibling is a tag
            # with a different name, so equal-named flat siblings stay
            # together on one line.
            if prettyPrint and isinstance(self.nextSibling, Tag) and self.nextSibling.name != self.name and not unflatten_here:
                s.append("\n")
            s = ''.join(s)
        return s

    def renderContents(self, encoding=BeautifulSoup.DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0, pprint_exs=[], unflatten=True):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string..

        ``pprint_exs`` is forwarded unchanged to child tags. When
        ``unflatten`` is false, text children are emitted without the
        leading indentation and trailing newline that pretty-printing
        would otherwise add."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, BeautifulSoup.NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                # Child tags render themselves, including their own
                # indentation and line breaks.
                s.append(c.__str__(encoding, prettyPrint, indentLevel, pprint_exs))
            # Pretty-printing discards surrounding whitespace of text
            # nodes; whitespace-only text then becomes empty and is
            # skipped below.
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint and unflatten:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint and unflatten:
                    s.append("\n")
        return ''.join(s)
# Rebind the module-level name so that code looking up ``BeautifulSoup.Tag``
# from here on gets the subclass defined in this file.
BeautifulSoup.Tag = Tag


# Combines the patched Tag with the library's BeautifulStoneSoup; Tag is
# listed first so its __str__/renderContents come first in the MRO.
class BeautifulStoneSoup(Tag, BeautifulSoup.BeautifulStoneSoup):
    pass


# Same rebinding for the module-level BeautifulStoneSoup name.
BeautifulSoup.BeautifulStoneSoup = BeautifulStoneSoup
class PumpkinSoup(BeautifulStoneSoup, BeautifulSoup.BeautifulSoup):
    """Soup class that stores a list of tag names (``pprint_exs``) and hands
    it to ``__str__`` when ``prettify()`` is called."""

    def __init__(self, *args, **kwargs):
        # Take our own keyword out of kwargs before delegating, so the
        # parent constructor never sees it. Defaults to an empty list.
        self.pprint_exs = kwargs.pop("pprint_exs", [])
        # NOTE(review): super() is anchored at BeautifulSoup.BeautifulSoup
        # rather than PumpkinSoup, so method lookup starts *after* that
        # class in the MRO and its own __init__ is not run - confirm that
        # this is intended.
        super(BeautifulSoup.BeautifulSoup, self).__init__(*args, **kwargs)

    def prettify(self, encoding=BeautifulSoup.DEFAULT_OUTPUT_ENCODING):
        # Second positional argument is prettyPrint=True.
        return self.__str__(encoding, True, pprint_exs=self.pprint_exs)
# Demo input: block-level <div>s containing runs of inline <span>/<a> tags.
doc = '''
<div>
<div>
<span>a</span><span>b</span>
<a>link1</a>
<a>link2</a>
<span>c</span>
</div>
<a>link3</a><a>link4</a>
</div>
'''

# "a" and "span" are passed as the tag names to exclude from pretty-printing.
soup = PumpkinSoup(
    doc,
    pprint_exs=["a", "span"],
)

# Parenthesised expression: same output as the bare Python 2 print statement.
print(soup.prettify())