我正在尝试读取数据集并应用一些信息检索代码。 这段代码应该读取字符串列表计算并呈现10个最常用的单词及其频率。代码似乎正在工作,但由于数据集非常大,因此需要永远计算结果(特别是第15行)。有关如何使其表现更快的任何建议?感谢
Summaries_file = 'data/aspirin__Summaries.pkl.bz2'
Summaries = pickle.load( bz2.BZ2File( Summaries_file, 'rb' ) )
paper = namedtuple( 'paper', ['title', 'authors', 'year', 'doi'] )
for (id, paper_info) in Summaries.items():
Summaries[id] = paper( *paper_info )
from collections import OrderedDict
the_list = []
a=[]
v=''
for p in Summaries:
my_list = (Summaries[p].title)
the_list.append(my_list)
a = " ".join(the_list).split()
wordfreq = []
for w in a:
wordfreq.append(a.count(w))
final = (sorted([i for i in zip(a, wordfreq)],reverse=True,key=lambda x:x[1]))
s = []
for i in final:
if i not in s:
s.append(i)
print(s[0:10])