from bs4 import BeautifulSoup, Comment
import re
id = 1
def makeformat(path):
global id
html = open(path, "r", encoding="utf-8")
soup = BeautifulSoup(html.read(), "lxml", from_encoding="utf-8")
html.close()
with open("document.txt", "a", encoding="utf-8") as f:
path = str(path.replace("./document/", "").replace("\\", "/"))
f.write("<DOC>\n<ID> " + str(id) + " </ID>\n")
f.write("<URL> http://" + path + " </URL>\n")
if not(soup.title is None):
f.write("<TITLE>" + str(soup.title.get_text(strip=True)) + "</TITLE>\n")
[tag.extract() for tag in soup.find_all(['script', 'style', 'noscript'])]
[comment.extract() for comment in soup.findAll(text=lambda text: isinstance(text, Comment))]
f.write("<BODY>\n")
for line in soup.body.strings:
f.write(re.sub("\s\s+", "\n", repr(line).rstrip()).replace("\'", "") + "\n")
f.write("\n</BODY>\n</DOC>\n\n")
id += 1
我想制作一种XML格式,它使用body标签中的字符串来删除样式,脚本和注释。
我尝试使用BeautifulSoup4删除标记,但是我遇到了以下错误:
AttributeError:&#39; NoneType&#39;对象没有属性&#39; next_element&#39;
为什么我会收到此错误,我该如何解决?
如果我不运行extract()
删除标记,则代码可以使用。