我尝试使用 Beautiful Soup 抓取 .txt 文件中每个目标标签（在我的列表中）之间的文本，并将每个单词存入字典进行计数。这段代码在功能上没有问题，但处理大文件时速度非常慢。有没有别的写法可以让这段代码运行得更快？
from bs4 import BeautifulSoup
words_dict = dict()
# these are all of the tags in the file I'm looking for
tags_list = ['title', 'h1', 'h2', 'h3', 'b', 'strong']
def grab_file_content(file : str):
with open(file, encoding = "utf-8") as file_object:
# entire content of the file with tags
content = BeautifulSoup(file_object, 'html.parser')
# if the content has content within the <body> tags...
if content.body:
for tag in tags_list:
for tags in content.find_all(tag):
text_list = tags.get_text().strip().split(" ")
for words in text_list:
if words in words_dict:
words_dict[words] += 1
else:
words_dict[words] = 1
else:
print('no body')
答案 0（得分：1）
以下代码在功能上与您的代码大致等效，而且应该快得多：
from collections import Counter
from itertools import chain
words_dict = Counter() # An empty counter further used as an accumulator
# Probably a loop
# Create the soup here, as in your original code
content = BeautifulSoup(file_object, 'html.parser')
words_dict += Counter(chain.from_iterable(tag.string.split()
for tag in content.find_all(tags_list) if tag.string))