我知道可以使用以下函数返回两个字符串的相似程度:
from difflib import SequenceMatcher
def similar(a, b):
    """Return how alike two sequences are, as a float between 0 and 1.

    The score is ``SequenceMatcher.ratio()``: identical inputs give 1.0 and
    inputs with nothing in common give 0.0.

    Args:
        a: First sequence (for example a string).
        b: Second sequence, compared against ``a``.

    Returns:
        The similarity ratio as a float.
    """
    # None as the first argument means no "junk" filter: every element
    # takes part in the matching.
    return SequenceMatcher(None, a, b).ratio()
In [37]: similar("Hey, this is a test!","Hey, man, this is a test, man.")
Out[37]: 0.76
In [38]: similar("This should be one.","This should be one.")
Out[38]: 1.0
但是,有没有办法根据键及其对应值的相似程度,得到两个字典之间的相似度呢?我想要的不是共有键的个数之类的结果,而是一个从0到1的分数,就像上面字符串的例子一样。
我想计算下面这个字典中 ratings['Shane'] 和 ratings['Joe'] 之间的相似度得分:
# Movie ratings per user: user name -> {movie title: score}.
ratings = {
    'Shane': {'127 Hours': 3.0, 'Avatar': 4.0, 'Nonstop': 5.0},
    'Joe': {'127 Hours': 5.0, 'Taken 3': 4.0, 'Avatar': 5.0, 'Nonstop': 3.0},
}
我正在使用Python 2.7.10
答案 0 :(得分:4)
可以使用余弦相似度(cosine similarity)来计算,代码如下:
import math
# Movie ratings per user: user name -> {movie title: score}.
ratings = {
    'Shane': {'127 Hours': 3.0, 'Avatar': 4.0, 'Nonstop': 5.0},
    'Joe': {'127 Hours': 5.0, 'Taken 3': 4.0, 'Avatar': 5.0, 'Nonstop': 3.0},
}
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two numeric vectors.

    Components are paired by position. The number of components compared is
    the length of ``vec1``; any extra entries in ``vec2`` are ignored.

    Args:
        vec1: Sequence of numbers.
        vec2: Indexable sequence of numbers with at least as many entries
            as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` over the compared
        components, or ``0.0`` when either vector has zero magnitude (the
        cosine is undefined there, and dividing would raise).

    Raises:
        IndexError: If ``vec2`` is shorter than ``vec1``.
    """
    sum11 = sum12 = sum22 = 0
    for i, x in enumerate(vec1):
        y = vec2[i]
        sum11 += x * x
        sum22 += y * y
        sum12 += x * y
    norm_product = math.sqrt(sum11 * sum22)
    if norm_product == 0:
        # Empty or all-zero input: avoid ZeroDivisionError.
        return 0.0
    return sum12 / norm_product
# Flatten each user's ratings into a plain list of scores. The movie titles
# are dropped here, so from this point on the scores are identified only by
# their position in the list.
list1 = list(ratings['Shane'].values())
list2 = list(ratings['Joe'].values())
# NOTE(review): Shane has three ratings and Joe has four, and the two dicts
# do not list the same titles in the same order. cosine_similarity pairs
# entries purely by position (up to the length of its first argument), so
# scores for different movies may be multiplied together and the result can
# vary with dict ordering. Align both lists on the same titles before
# relying on this number.
sim = cosine_similarity(list1,list2)
print(sim)
**更新**:当我使用以下数据时:
o/p : 0.9205746178983233
输出:ratings={'Shane': {'127 Hours': 5.0, 'Avatar': 4.0, 'Nonstop': 5.0},
'Joe': {'127 Hours': 5.0, 'Taken 3': 4.0, 'Avatar': 5.0, 'Nonstop': 3.0}}
更新2:统一了两个向量的长度,并把字典的键也考虑在内
0.9574271077563381
输出:
from math import*
# Movie ratings per user: user name -> {movie title: score}.
ratings = {
    'Shane': {'127 Hours': 5.0, 'Avatar': 4.0, 'Nonstop': 5.0},
    'Joe': {'127 Hours': 5.0, 'Taken 3': 4.0, 'Avatar': 5.0, 'Nonstop': 3.0},
    'Bob': {'Panic Room': 5.0, 'Nonstop': 5.0},
}
def square_rooted(x):
    """Return the Euclidean norm of the numbers in *x*, rounded to 3 decimals."""
    return round(sqrt(sum(a * a for a in x)), 3)


def cosine_similarity(x, y):
    """Return the cosine similarity of two ``{key: number}`` dicts.

    Each dict is treated as a sparse vector indexed by its keys; a key that
    is missing from one dict counts as 0 in that dict. The result does not
    depend on the order of the two arguments.

    Args:
        x: Mapping from key to numeric value.
        y: Mapping from key to numeric value.

    Returns:
        The similarity rounded to 3 decimals, or ``0.0`` when either dict
        is empty or contains only zeros (the cosine is undefined there).
    """
    # A key missing on either side contributes value * 0, so only keys
    # present in BOTH dicts add to the dot product. Each factor must come
    # from its own dict: reading both from the same dict would compare a
    # user with themselves.
    numerator = sum(float(x[k]) * float(y[k]) for k in x if k in y)
    # Each norm covers all of that dict's values, including keys the other
    # dict lacks; dropping them would overstate the similarity.
    denominator = square_rooted(x.values()) * square_rooted(y.values())
    if denominator == 0:
        return 0.0
    return round(numerator / float(denominator), 3)
# Each row: the heading to print, then the two users to compare.
comparisons = [
    ("Similarity between Shane and Joe", 'Shane', 'Joe'),
    ("Similarity between Joe and Bob", 'Joe', 'Bob'),
    ("Similarity between Shane and Bob", 'Shane', 'Bob'),
]
for heading, first_user, second_user in comparisons:
    print(heading)
    print(cosine_similarity(ratings[first_user], ratings[second_user]))
关于 Jaccard 相似度与余弦相似度的区别和适用场景,这里有一个很好的解释:https://datascience.stackexchange.com/questions/5121/applications-and-differences-for-jaccard-similarity-and-cosine-similarity
我正在使用Python 3.4
注意:我已将0分配给缺失值。但是你也可以指定一些合适的值。请参阅:http://www.analyticsvidhya.com/blog/2015/02/7-steps-data-exploration-preparation-building-model-part-2/
答案 1 :(得分:3)
https://en.m.wikipedia.org/wiki/Jaccard_index
现在已经清理了一些示例代码。
def jac(s1, s2):
    """Return the Jaccard index of two sets: ``|s1 & s2| / |s1 | s2|``.

    Args:
        s1: A set.
        s2: A set (or any iterable accepted by ``set.union``).

    Returns:
        A float between 0.0 (nothing shared) and 1.0 (identical sets).
        Two empty sets give 0.0, since the ratio is undefined there.
    """
    len_union = len(s1.union(s2))
    if not len_union:
        # Float, so every path returns the same type.
        return 0.0
    # float() keeps this a true division on Python 2 as well.
    return float(len(s1.intersection(s2))) / len_union
from itertools import permutations
# Movie ratings per user: user name -> {movie title: score}.
ratings = {
    'Shane': {'127 Hours': 5.0, 'Avatar': 4.0, 'Nonstop': 5.0},
    'Joe': {'127 Hours': 5.0, 'Taken 3': 4.0, 'Avatar': 5.0, 'Nonstop': 3.0},
    'Bob': {'Panic Room': 5.0, 'Nonstop': 5.0},
}
def common_movie(dict0, dict1):
    """Return the Jaccard index of the two dicts' (movie, rating) pairs.

    A movie only counts as shared when both dicts give it the same rating,
    because whole ``(key, value)`` items are compared, not just the keys.

    Args:
        dict0: Mapping from movie title to rating (values must be hashable).
        dict1: Mapping from movie title to rating (values must be hashable).

    Returns:
        Whatever ``jac`` returns for the two item sets.
    """
    return jac(set(dict0.items()), set(dict1.items()))
def movies_and_ratings(dict0, dict1):
    """Blend title overlap and exact (title, rating) overlap into one score.

    Args:
        dict0: Mapping from movie title to rating (values must be hashable).
        dict1: Mapping from movie title to rating (values must be hashable).

    Returns:
        ``0.3 * jac(titles) + 0.7 * jac((title, rating) pairs)``.
    """
    key_commonality = jac(set(dict0.keys()), set(dict1.keys()))
    item_commonality = jac(set(dict0.items()), set(dict1.items()))
    # Rating the same movie earns partial credit even when the ratings
    # themselves differ.
    return 0.3 * key_commonality + 0.7 * item_commonality
def common_movie_ratings(dict0, dict1):
    """Return the Jaccard index of the rating values given to shared movies.

    Only movies present in both dicts are considered. For each dict the
    ratings of those movies are collapsed into a set of distinct values, and
    the two sets are compared. Which movie a rating belongs to is not taken
    into account.

    Args:
        dict0: Mapping from movie title to rating (values must be hashable).
        dict1: Mapping from movie title to rating (values must be hashable).

    Returns:
        Whatever ``jac`` returns for the two sets of rating values.
    """
    set_common = set(dict0.keys()).intersection(dict1.keys())
    set0 = {v for k, v in dict0.items() if k in set_common}
    set1 = {v for k, v in dict1.items() if k in set_common}
    return jac(set0, set1)
# Report all three scores for every ordered pair of users. permutations()
# yields both (A, B) and (B, A), so each pair of users is printed twice.
# print(...) with a single pre-formatted string behaves the same on
# Python 2 and Python 3.
for pair in permutations(ratings.keys(), 2):
    dict0, dict1 = ratings[pair[0]], ratings[pair[1]]
    print("\n %s vs %s" % (pair))
    # Make no assumption about the order items come out of a dictionary:
    # sort them. sorted() is used because dict.items() has no .sort() on
    # Python 3.
    li = sorted(dict0.items())
    print(" %s" % (li,))
    li = sorted(dict1.items())
    print(" %s" % (li,))
    print(" common_movie :%s" % common_movie(dict0, dict1))
    print(" movies_and_ratings:%s" % movies_and_ratings(dict0, dict1))
    print(" common_movie_ratings :%s" % common_movie_ratings(dict0, dict1))
输出:
Shane vs Bob
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
[('Nonstop', 5.0), ('Panic Room', 5.0)]
common_movie :0.25
movies_and_ratings:0.25
common_movie_ratings :1.0
Shane vs Joe
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
common_movie :0.166666666667
movies_and_ratings:0.341666666667
common_movie_ratings :0.333333333333
Bob vs Shane
[('Nonstop', 5.0), ('Panic Room', 5.0)]
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
common_movie :0.25
movies_and_ratings:0.25
common_movie_ratings :1.0
Bob vs Joe
[('Nonstop', 5.0), ('Panic Room', 5.0)]
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
common_movie :0.0
movies_and_ratings:0.06
common_movie_ratings :0.0
Joe vs Shane
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
common_movie :0.166666666667
movies_and_ratings:0.341666666667
common_movie_ratings :0.333333333333
Joe vs Bob
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
[('Nonstop', 5.0), ('Panic Room', 5.0)]
common_movie :0.0
movies_and_ratings:0.06
common_movie_ratings :0.0
答案 2 :(得分:1)
这是我对上述Jaccard相似性数据科学stackexchange帖子的实现。
假设你有两个来自 collections 库的 Counter 结果(即统计可迭代对象中每个键出现次数的字典):
# Two small key -> count mappings used as sample input.
d1 = dict(a=2, b=1)
d2 = dict(a=1, c=1)
def get_jaccard_similarity(d1, d2):
    """Return the weighted Jaccard similarity of two ``{key: count}`` dicts.

    For every key in either dict, the smaller of the two counts is treated
    as shared and the larger as the total; a missing key counts as 0. The
    result is ``sum(shared) / sum(total)``.

    Args:
        d1: Dict mapping keys to numeric counts.
        d2: Dict mapping keys to numeric counts.

    Returns:
        1 when both dicts are empty, 0 when exactly one is empty, 1 when
        every count in both dicts is zero, otherwise the ratio above.

    Raises:
        TypeError: If either argument is not a ``dict``.
    """
    if not isinstance(d1, dict) or not isinstance(d2, dict):
        raise TypeError(
            'd1 and d2 should be of type dict'
            f' and not {type(d1).__name__}, {type(d2).__name__}')
    if not d1 and not d2:
        return 1
    if not d1 or not d2:
        return 0
    all_keys = d1.keys() | d2.keys()
    shared = sum(min(d1.get(k, 0), d2.get(k, 0)) for k in all_keys)
    total = sum(max(d1.get(k, 0), d2.get(k, 0)) for k in all_keys)
    if total == 0:
        # Every count is zero, so both dicts describe an empty multiset.
        # Treat them like the empty/empty case above instead of dividing
        # by zero.
        return 1
    return shared / total
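输出: 0.75(注:按上面的代码,对所给的 d1、d2 调用 get_jaccard_similarity(d1, d2) 实际返回 1/4 = 0.25;这里的 0.75 可能对应 1 − 相似度,或者来自另一组输入,请自行核对。)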
那篇 Data Science Stack Exchange 帖子是基于集合的概念来定义 Jaccard 相似度的。我相信对于集合(即所有值都等于1的字典),此实现会得到与之相同的结果;不同之处在于,它还把每个键在两个(Counter)字典中出现的次数作为权重考虑在内。