我正在使用一个名为Calibre的程序将PDF文件转换为EPUB文件,但是结果非常混乱且不可读。EPUB文件本质上只是HTML文件的集合,转换结果之所以混乱,是因为Calibre将PDF文件的每一行都解释为一个<p>元素,这会在EPUB文件中造成很多难看的换行符。
由于EPUB实际上是HTML文件的集合,因此可以使用Beautiful Soup对其进行解析。我编写了一个程序,用于查找具有“calibre1”类(正常段落)的元素,并将它们合并为单个元素(这样就不会出现难看的换行符),但它无法正常工作,我也弄不清楚原因。
Beautiful Soup能处理我想要做的事情吗?
import os
from bs4 import BeautifulSoup
# Root directory that is walked for the book's HTML files.
path = "C:\\Users\\Eunice\\Desktop\\eBook"

# NOTE: this is the script the question is about. Its logic is reproduced
# unchanged (only the lost indentation is restored), so it still yields the
# "actual output" shown further down rather than the expected one.
for pathname, directorynames, filenames in os.walk(path):
    for file_name in filenames:
        # Every file found is read as text using the "Latin1" encoding.
        with open(pathname + "\\" + file_name, 'r', encoding="Latin1") as source:
            soup = BeautifulSoup(source, 'html.parser')

            # Elements that will be serialised into the output file.
            html_elem_list = []

            # find_all() with no arguments yields every tag in the document,
            # nested tags included.
            for html_element in soup.find_all():
                try:
                    # Substring test, exactly as written in the question:
                    # true when the first class is contained in 'calibre1'.
                    if html_element.attrs['class'][0] not in 'calibre1':
                        html_elem_list.append(html_element)
                        continue
                    # Fold this element's text into the previously kept element
                    # when that one passes the same class test.
                    previous = html_elem_list[-1]
                    if previous.attrs['class'][0] in 'calibre1':
                        previous.string = previous.text.replace(
                            '\n', ' ') + html_element.text
                except KeyError:
                    # Either this element or the previously kept one has no
                    # 'class' attribute: keep this element as-is.
                    html_elem_list.append(html_element)

            # Concatenate the string form of every kept element.
            str_htmlfile = ''.join(str(elem) for elem in html_elem_list)

            # Write the result next to the original, prefixed with "_modified_".
            with open(pathname + "\\" + '_modified_' + file_name, 'wb') as target:
                target.write(str_htmlfile.encode('Latin1'))
以下是输入内容:
<?xml version='1.0' encoding='Latin1'?>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<body class="calibre">
<p class="calibre5" id="calibre_pb_62">Note for Tyler</p>
<p class="calibre1">In the California registry, there was</p>
<p class="calibre1">a calm breeze blowing through the room. A woman</p>
<p class="calibre1">who must have just walked in quietly beckoned for the</p>
<p class="calibre1">counterman to approach to store her slip.</p>
<p class="calibre1">642</p>
</body></html>
这是我期望发生的事情:
<?xml version='1.0' encoding='Latin1'?>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<body class="calibre">
<p class="calibre5" id="calibre_pb_62">Note for Tyler</p>
<p class="calibre1">In the California registry, there was a calm breeze blowing through the room. A woman who must have just walked in quietly beckoned for the counterman to approach to store her slip.642</p>
</body></html>
这是实际输出:
<html lang="" xml:lang="" xmlns="http://www.w3.org/1999/xhtml">
<body class="calibre">
<p class="calibre5" id="calibre_pb_62">Note for Tyler</p>
<p class="calibre1">In the California registry, there was</p>
<p class="calibre1">a calm breeze blowing through the room. A woman</p>
<p class="calibre1">who must have just walked in quietly beckoned for the</p>
<p class="calibre1">counterman to approach to store her slip.</p>
<p class="calibre1">642</p>
</body></html><body class="calibre">
<p class="calibre5" id="calibre_pb_62">Note for Tyler</p>
<p class="calibre1">In the California registry, there was</p>
<p class="calibre1">a calm breeze blowing through the room. A woman</p>
<p class="calibre1">who must have just walked in quietly beckoned for the</p>
<p class="calibre1">counterman to approach to store her slip.</p>
<p class="calibre1">642</p>
</body><p class="calibre5" id="calibre_pb_62">Note for Tyler</p>
答案 0 :(得分:2)
这可以使用BeautifulSoup来完成:先使用extract()删除不需要的<p>元素,然后使用new_tag()创建一个新的<p>标签,其中包含所有被删除元素的文本。例如:
# Sample markup parsed by the example below: two "calibre5" paragraphs, each
# followed by a run of five "calibre1" paragraphs.
html = """<?xml version='1.0' encoding='Latin1'?>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<body class="calibre">
<p class="calibre5" id="calibre_pb_62">Note for Tyler1</p>
<p class="calibre1">In the California registry, there was</p>
<p class="calibre1">a calm breeze blowing through the room. A woman</p>
<p class="calibre1">who must have just walked in quietly beckoned for the</p>
<p class="calibre1">counterman to approach to store her slip.</p>
<p class="calibre1">642</p>
<p class="calibre5" id="calibre_pb_62">Note for Tyler2</p>
<p class="calibre1">In the California registry, there was</p>
<p class="calibre1">a calm breeze blowing through the room. A woman</p>
<p class="calibre1">who must have just walked in quietly beckoned for the</p>
<p class="calibre1">counterman to approach to store her slip.</p>
<p class="calibre1">642</p>
</body></html>"""
from bs4 import BeautifulSoup
from itertools import groupby
import re
soup = BeautifulSoup(html, "html.parser")

# All <p> tags whose class matches "calibre<digit>", in document order.
paragraphs = soup.find_all("p", class_=re.compile(r"calibre\d"))

# groupby() yields runs of consecutive paragraphs sharing the same first class.
for level, group in groupby(paragraphs, lambda x: x["class"][0]):
    if level != "calibre1":
        continue

    run = list(group)

    # Build one replacement <p> holding the run's text, joined by single spaces.
    merged = soup.new_tag('p', attrs={"class" : "calibre1"})
    merged.string = ' '.join(p.get_text(strip=True) for p in run)

    # Insert the merged tag where the run started, then drop the originals.
    run[0].insert_before(merged)
    for old in run:
        old.extract()

print(soup.prettify())
这将输出如下HTML:
<?xml version='1.0' encoding='Latin1'?>
<html lang="" xml:lang="" xmlns="http://www.w3.org/1999/xhtml">
<body class="calibre">
<p class="calibre5" id="calibre_pb_62">
Note for Tyler1
</p>
<p class="calibre1">
In the California registry, there was a calm breeze blowing through the room. A woman who must have just walked in quietly beckoned for the counterman to approach to store her slip. 642
</p>
<p class="calibre5" id="calibre_pb_62">
Note for Tyler2
</p>
<p class="calibre1">
In the California registry, there was a calm breeze blowing through the room. A woman who must have just walked in quietly beckoned for the counterman to approach to store her slip. 642
</p>
</body>
</html>
它的工作原理是查找连续出现的calibre1标签(即一段连续的“run”)。对于每一段,它首先合并其中所有的文本,然后在该段的第一个标签之前插入一个新标签,最后删除所有旧标签。
对于EPUB文件中更复杂的情况,可能需要修改逻辑,但这应该有助于您入门。
答案 1 :(得分:2)
问题:以编程方式组合某些HTML标记的内容
此示例使用lxml解析XHTML文件并构建新的XHTML树。
import io, os
from lxml import etree
# In-memory sample input (bytes): one "calibre5" paragraph followed by five
# "calibre1" paragraphs. The usage example wraps it in io.BytesIO in place of
# a real file.
XHTML = b"""<?xml version='1.0' encoding='Latin1'?>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<body class="calibre">
<p class="calibre5" id="calibre_pb_62">Note for Tyler</p>
<p class="calibre1">In the California registry, there was</p>
<p class="calibre1">a calm breeze blowing through the room. A woman</p>
<p class="calibre1">who must have just walked in quietly beckoned for the</p>
<p class="calibre1">counterman to approach to store her slip.</p>
<p class="calibre1">642</p>
</body></html>"""
class Calibre2EPUB(etree.iterparse):
    """
    Rebuild an XHTML document, merging runs of <p class="calibre1"> elements.

    The source is consumed once through 'iterparse'. The <html> and <body>
    elements are copied, every <p> is appended directly to the new <body>, and
    each run of consecutive <p class="calibre1"> elements is collapsed into a
    single <p> whose text is the run's text joined with single spaces.
    The serialized result is available through the 'xhtml' property.
    """

    def __init__(self, fh):
        """
        Initialize 'iterparse' to only generate 'start' and 'end' events
        and build the new tree immediately.

        :param fh: File Handle (binary) from the XHTML File to parse
        """
        super().__init__(fh, events=('start', 'end'))
        self.parse()

    def element(self, elem, parent=None):
        """
        Copy 'elem' with attributes and text to new Element

        :param elem: Source Element
        :param parent: Parent of the new Element; None creates a root Element
        :return: New Element
        """
        if parent is None:
            namespace = etree.QName(elem).namespace
            # Only declare a default namespace when the source element has one.
            nsmap = {None: namespace} if namespace else None
            e = etree.Element(elem.tag, nsmap=nsmap)
        else:
            e = etree.SubElement(parent, elem.tag)

        for key, value in elem.attrib.items():
            e.set(key, value)

        if elem.text:
            e.text = elem.text
        return e

    def parse(self):
        """
        Parse all Elements, copy <p> Elements 1:1 except <p class='calibre1'>.
        Aggregate the text of each run of consecutive <p class='calibre1'>
        Elements into one Element.

        :return: None
        """
        # Element collecting the text of the current 'calibre1' run, if any.
        self.calibre1 = None

        for event, elem in self:
            # Compare the local name exactly, so that e.g. <tbody> is not taken
            # for <body> and <sup> is not taken for <p>.
            name = etree.QName(elem).localname

            if event == 'start':
                if name == 'html':
                    self._xhtml = self.element(elem)
                elif name == 'body':
                    self.body = self.element(elem, parent=self._xhtml)

            elif event == 'end' and name == 'p':
                # A <p> without a 'class' attribute is treated as ordinary,
                # non-'calibre1' content.
                if elem.get('class') != 'calibre1':
                    self.element(elem, parent=self.body)
                    # Any other paragraph ends the current run, so later
                    # 'calibre1' text is not moved in front of it.
                    self.calibre1 = None
                elif self.calibre1 is None:
                    # First paragraph of a new run.
                    self.calibre1 = self.element(elem, parent=self.body)
                elif elem.text:
                    # Append to the run; either side may have no text at all.
                    collected = self.calibre1.text
                    if collected:
                        self.calibre1.text = collected + ' ' + elem.text
                    else:
                        self.calibre1.text = elem.text

    @property
    def xhtml(self):
        """
        :return: The new Element Tree XHTML, serialized as 'Latin1' bytes
        """
        return etree.tostring(self._xhtml, xml_declaration=True, encoding='Latin1', pretty_print=True)
用法:
if __name__ == "__main__":
    # The sample bytes stand in for a real file here. To convert a file on
    # disk, pass a handle opened in binary mode instead, e.g.
    # open(os.path.join(pathname, file_name), 'rb'), and write the bytes
    # returned by the 'xhtml' property to an output file opened with 'wb'.
    with io.BytesIO(XHTML) as in_file:
        converter = Calibre2EPUB(in_file)
        print(converter.xhtml.decode())
输出:
<?xml version='1.0' encoding='Latin1'?> <html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang=""> <body class="calibre"> <p class="calibre5" id="calibre_pb_62">Note for Tyler</p> <p class="calibre1">In the California registry, ... (omitted for brevity)to store her slip. 642</p> </body></html>
已使用Python 3.5测试。