First, each document's text is converted into a list of n-grams, e.g. [('a', 'b', 'c'), ('b', 'c', 'd'), ('c', 'd', 'e')]. TF-IDF values are then computed from those n-grams, and a similarity and a uniqueness score are calculated for every document.
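For concreteness, here is a minimal sketch of that input format and of one way to vectorise it (the toy tokens and the doc_a/doc_b names are made up; passing a callable analyzer makes TfidfVectorizer treat each n-gram tuple as a single term):

from sklearn.feature_extraction.text import TfidfVectorizer

# Two toy documents, already converted to trigram lists.
doc_a = [('a', 'b', 'c'), ('b', 'c', 'd'), ('c', 'd', 'e')]
doc_b = [('b', 'c', 'd'), ('c', 'd', 'e'), ('d', 'e', 'f')]

# A callable analyzer bypasses the built-in string tokenisation, so every
# n-gram tuple is counted as one vocabulary term.
vectorizer = TfidfVectorizer(analyzer=lambda doc: doc)
tf_idf = vectorizer.fit_transform([doc_a, doc_b])
print(tf_idf.shape)  # (2, 4): 2 documents, 4 distinct trigrams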
The problem I ran into is how to use these n-gram lists to compute similarity and then find each document's uniqueness and similarity. Please update the following code to solve my problem. I am currently using this code:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams  # assuming the ngrams helper is NLTK's

# 'stemmed' is the token list produced by an earlier stemming step (not shown).
tokens_list = stemmed
n = 3
ngramArray = list(ngrams(tokens_list, n))
# 'read_file_name' presumably comes from the surrounding file-reading loop (not shown).
documents = []
documents.append({
    'name': read_file_name,
    'ngram_list': ngramArray
})
# Each entry in 'texts' is a list of n-gram tuples, not a string, so the
# default tokenizer cannot be used; a callable analyzer passes the tuples
# through unchanged and each tuple becomes one vocabulary term.
texts = [x['ngram_list'] for x in documents]
tf_idf = TfidfVectorizer(analyzer=lambda doc: doc).fit_transform(texts)

# TfidfVectorizer L2-normalises each row, so this matrix product yields
# the pairwise cosine similarities.
pairwise_similarity = tf_idf * tf_idf.T
similarity_list = pairwise_similarity.toarray().tolist()
# Uniqueness of a document = 1 minus its highest similarity to any other document.
result_list = []
for index in range(len(similarity_list)):
    others = [x for i, x in enumerate(similarity_list[index]) if i != index]
    uniqueness = 1 - (max(others) if len(others) > 0 else 0)
    result_list.append(uniqueness)
uniqueness = [
    {
        'file': documents[i]['name'],
        'uniqueness': round(x * 100, 2)
    } for i, x in enumerate(result_list)
]
docs = [
    {
        'file': documents[i]['name'],
        'similarities': [
            {
                'file': documents[j]['name'],
                'similarity': round(y * 100, 2)
            } for j, y in enumerate(x)
        ]
    } for i, x in enumerate(similarity_list)
]
# Keep only the single most similar *other* document for each file.
for doc in docs:
    doc['similarities'] = sorted(doc['similarities'], key=lambda k: k['similarity'], reverse=True)
    doc['similarities'] = [x for x in doc['similarities'] if x['file'] != doc['file']][0:1]
context = {
    'results': uniqueness,
    'docs': docs
}
print(context)
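An alternative to the callable analyzer (a sketch; the underscore separator is an arbitrary choice) is to flatten each trigram into a single string token and let the vectorizer split on whitespace:

# Encode every trigram as one underscore-joined token, e.g.
# ('a', 'b', 'c') -> 'a_b_c', then pass one string per document.
joined = [' '.join('_'.join(gram) for gram in doc['ngram_list'])
          for doc in documents]
tf_idf = TfidfVectorizer(token_pattern=r'\S+').fit_transform(joined)

Both variants should produce the same pairwise cosine similarities, assuming the stemmed tokens are already lower-cased (TfidfVectorizer lower-cases string input by default).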