我正在使用 Python 脚本计算关键字列表之间的余弦相似度。我运行脚本,然后检查数据。输出在大约第54行之前看起来都正常,但从那之后所有余弦相似度都变成了0。我可以在数据中手动验证这些余弦相似度不应为0。
数据集约有140,000个项目。如果需要,我可以提供更多信息,感谢您的帮助!
以下是两组示例关键字:
women's history; religious history; eighteenth-century studies; eighteenth-century literature
terrorism; utopia and dystopia; representation; cinema; cultural studies
我将以下内容用作python中余弦相似度的参考:cosine similarity between two words in a list
这是我的脚本:
# -*- coding: utf-8 -*-
import database
import pandas as pd
import math
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import sys
import re
def cosdis(v1, v2):
    """Return the cosine similarity of two vectors built by ``word2vec``.

    Each argument is indexed as ``[0]`` item -> count mapping, ``[1]`` set of
    items, ``[2]`` Euclidean norm of the counts.

    Returns the dot product over the shared items divided by both norms, or
    the sentinel ``-100.0`` when either norm is not strictly positive.
    """
    shared_items = v1[1].intersection(v2[1])

    # Guard against division by zero for empty vectors.
    both_norms_positive = v1[2] > 0 and v2[2] > 0
    if not both_norms_positive:
        return -100.0

    dot_product = 0
    for item in shared_items:
        dot_product += v1[0][item] * v2[0][item]
    return dot_product / v1[2] / v2[2]
def word2vec(word):
    """Turn an iterable of hashable items into a count vector.

    ``word`` may be a string (items are characters) or, as this script uses
    it, the list of tokens returned by ``preprocess`` (items are tokens).

    Returns a tuple ``(cw, sw, lw)``:
      * ``cw`` -- ``Counter`` mapping each item to its number of occurrences,
      * ``sw`` -- set of the distinct items,
      * ``lw`` -- Euclidean norm of the counts (``0.0`` for empty input).

    NOTE: this function was previously defined twice in a row with identical
    bodies; the redundant second definition has been removed.
    """
    cw = Counter(word)
    sw = set(cw)
    lw = math.sqrt(sum(c * c for c in cw.values()))
    return cw, sw, lw
def preprocess(keywords):
    """Normalise a raw keyword string into a list of stemmed tokens.

    Steps, in order:
      1. replace every run of digits / non-word characters with one space,
      2. lowercase,
      3. split on whitespace,
      4. drop tokens found in NLTK's English stopword list,
      5. stem the remaining tokens with ``PorterStemmer``.

    Returns the list of stemmed tokens (possibly empty).
    """
    # Remove special characters and digits.
    keywords = re.sub(r"(\d|\W)+", " ", keywords)
    tokens = keywords.lower().split()

    # Build the stopword set once per call. Previously
    # stopwords.words('english') was called again for every token and each
    # lookup was a linear scan of the returned list.
    stop_words = set(stopwords.words('english'))

    ps = PorterStemmer()
    return [ps.stem(word) for word in tokens if word not in stop_words]
def getBooks(connection):
    """Read the ``id`` and ``keyword`` columns of the ``books`` table.

    Rows whose keyword equals "No Keyword" are excluded by the query.
    Returns the result of ``pd.read_sql`` for that query on ``connection``.
    """
    query = 'SELECT id, keyword FROM books WHERE keyword != "No Keyword"'
    books = pd.read_sql(query, connection)
    return books
def getWeightedCosineSimilarity(keyword_list_1, keyword_list_2):
    """Return the cosine similarity of two raw keyword strings as a float.

    Each string is run through ``preprocess`` and ``word2vec``; the two
    resulting vectors are compared with ``cosdis``. The ``cosdis`` sentinel
    ``-100.0`` is passed through when either side has no tokens left.
    """
    first_vector = word2vec(preprocess(keyword_list_1))
    second_vector = word2vec(preprocess(keyword_list_2))
    similarity = cosdis(first_vector, second_vector)
    return float(similarity)
def getTopFiveSimilarKeywords(book, book_list):
    """Return the five books in ``book_list`` most similar to ``book``.

    ``book`` is a row with ``'id'`` and ``'keyword'`` entries; ``book_list``
    is iterated with ``iterrows()`` and each row is read the same way. The
    row whose id equals ``book['id']`` is skipped.

    Returns a list of at most five dicts ``{'id': ..., 'cosim': ...}`` where
    ``'cosim'`` is the similarity formatted with ``"{:10.6f}"``, ordered from
    highest to lowest similarity. Ties keep their order of appearance in
    ``book_list``.
    """
    # Local import so this block does not depend on the file's import header.
    import heapq

    book_id = book['id']
    # Vectorise the query book once; previously it was preprocessed again
    # for every single comparison.
    book_vector = word2vec(preprocess(book['keyword']))

    similar_keywords = []
    for _, other_book in book_list.iterrows():
        if other_book['id'] == book_id:
            continue
        other_vector = word2vec(preprocess(other_book['keyword']))
        sim = float(cosdis(book_vector, other_vector))
        similar_keywords.append(
            {'id': other_book['id'], 'cosim': "{:10.6f}".format(sim)}
        )

    # nlargest(n, it, key=k) is equivalent to sorted(it, key=k,
    # reverse=True)[:n] but avoids sorting the whole list. The key still
    # parses the formatted string so ordering matches the stored values.
    return heapq.nlargest(5, similar_keywords, key=lambda k: float(k['cosim']))
def main():
    """Compute and store the top five similar books for every book.

    Connects through the ``database`` module, loads the books with
    ``getBooks``, and for each book writes the result of
    ``getTopFiveSimilarKeywords`` via
    ``database.insertTopFiveCosineSimilarBooks``. The connection is closed
    even if processing fails part-way.
    """
    sys.stdout.flush()
    connection = database.connect()
    try:
        # Fixed: this line previously read `getBooksconnection);`, which is
        # a syntax error.
        books = getBooks(connection)
        cursor = connection.cursor()
        for _, book in books.iterrows():
            top_five_similar_books = getTopFiveSimilarKeywords(book, books)
            database.insertTopFiveCosineSimilarBooks(
                book['id'], top_five_similar_books, cursor
            )
            # NOTE(review): the original indentation was lost, so it is
            # unclear whether the commit ran per book or once after the
            # loop. Committing per book keeps finished rows if a later
            # book fails -- confirm this matches the intended behaviour.
            connection.commit()
    finally:
        database.close(connection)
# Script entry point: main() is called unconditionally (there is no
# `if __name__ == "__main__"` guard), so it also runs if this file is imported.
main()
当我查看数据库时,前53本书的输出如下所示:
[{"id": 18, "cosim": " 1.000000"}, {"id": 29, "cosim": " 1.000000"}, {"id": 121561, "cosim": " 0.458682"}, {"id": 121563, "cosim": " 0.458682"}, {"id": 121592, "cosim": " 0.458682"}]
然后,在其余的书中,我看到以下内容:
[{"id": 1, "cosim": " 0.000000"}, {"id": 2, "cosim": " 0.000000"}, {"id": 3, "cosim": " 0.000000"}, {"id": 5, "cosim": " 0.000000"}, {"id": 13, "cosim": " 0.000000"}]
这个结果是不正确的。
我不知道该如何调试。