我已经阅读了一些类似的帖子,但没有找到与我的问题完全相同的内容。
我正在计算tfidf值的余弦相似度,并且正在获得ValueError: too many values to unpack (expected 2)
据我所知,这基本上是说预期有2个值,并且有超过2个值被输入,导致错误。但我不确定如何从这里前进。
这是完整的代码
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
def find_similar(tfidf_matrix, index, top_n = 5):
cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]
corpus = []
for doc in nlp.pipe(df['TIP_all_txt'].astype('unicode').values, batch_size=9845,
n_threads=3):
if doc.is_parsed:
corpus.append([n.text for n in doc if not n.is_punct and not n.is_stop and not n.is_space])
else:
corpus.append(None)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')
tfidf_matrix = tf.fit_transform([content for doc, doc in corpus])
for me_index, item in enumerate(corpus):
similar_documents = [(corpus[index], score) for index, score in find_similar(tfidf_matrix, me_index)]
me = corpus[me_index]
document_id = me[0].split("/")[1].split(".")[0]
for ((raw_similar_document_id, title), score) in similar_documents:
similar_document_id = raw_similar_document_id.split("/")[1].split(".")[0]
writer.writerow([document_id, me[1], similar_document_id, title, score])
和错误消息:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-75-8522d91e96f8> in <module>()
8
9 tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')
---> 10 tfidf_matrix = tf.fit_transform([content for doc, doc in corpus])
11
12 for me_index, item in enumerate(corpus):
<ipython-input-75-8522d91e96f8> in <listcomp>(.0)
8
9 tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')
---> 10 tfidf_matrix = tf.fit_transform([content for doc, doc in corpus])
11
12 for me_index, item in enumerate(corpus):
ValueError: too many values to unpack (expected 2)