存在“无法索引具有零个特征的语料库(您必须在构造函数中指定num_features
或非空语料库”)
我认为这里应该出错了,但是我不明白为什么出错了。
dictionary = corpora.Dictionary(corpus)
doc_vectors = [dictionary.doc2bow(text) for text in corpus]
tfidf = models.TfidfModel(doc_vectors)
tfidf_vectors = tfidf[doc_vectors]
df = fetch_news()
stop_words = set(line.strip() for line inn open('desktop/pythontest/stopwords.txt', encoding = 'utf-8'))
for i in range(len(df)):
newslist = []
titlelist = []
corpus = []
for subject in df.article[i]:
if subject.isspace():
continue
word_list = pseg.cut(subject)
for word, flag in word_list:
if not word in stop_words and flag == 'n':
newslist.append(word)
for subject in df.title[i]:
if subject.isspace():
continue
word_list = pseg.cut(subject)
for word, flag in word_list:
if not word in stop_words and flag == 'n':
titlelist.append(word)
corpus.append(newslist)
print (len(corpus))
print("-----")
dictionary = corpora.Dictionary(corpus)
doc_vectors = [dictionary.doc2bow(text) for text in corpus]
print (doc_vectors)
tfidf = models.TfidfModel(doc_vectors)
tfidf_vectors = tfidf[doc_vectors]
for doc in tfidf_vectors:
print (doc)
print ("++++++")
print (len(tfidf_vectors))
print (tfidf_vectors[0])
query = dictionary.doc2bow(titlelist)
print (query)
index = similarities.MatrixSimilarity(tfidf_vectors)
sims = index[query]
sims = sorted(enmerate(sims), key = lambda item: -item[1])
print(sims)
print("------------")
输出为:
>1
>-----
>[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, >1), (10, 1), (11, 2), (12, 1), (13, 3), (14, 1), (15, 1), (16, 3), (17, 1), (18, 1)]]
>{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1}
>++++++
>0
>[]
>[(4, 1), (7, 1), (13, 1), (16, 1)]
>Traceback (most recent call last):
> File "desktop/pythontest/GraduationDesignCopy1.py", line 207, in <module>
> index = similarities.MatrixSimilarity(tfidf_vectors)
> File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/gensim/similarities/docsim.py", line 790, in __init__
> "cannot index a corpus with zero features (you must specify either `num_features` "
>ValueError: cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)