我在这里有一些代码:
import math
from collections import Counter
# NOTE(review): the indentation of this snippet was lost when it was pasted,
# so the nesting described in the comments below is inferred from the syntax
# and should be confirmed against the original post. The code targets
# Python 2 (dict.iteritems, dict.has_key, print statement).
# This is the asker's version: it is meant to rank file numbers by their
# similarity to the search, and the comments flag where it deviates from that.
def forSearch():
# words maps term -> {file number: occurrences of the term in that file}.
words = {'bit':{1:3,2:4,3:19,4:0},'red':{1:0,2:0,3:15,4:0},'dog':{1:3,2:0,3:4,4:5}}
# search maps term -> occurrences of the term in the query.
search = {'bit':1,'dog':3,'shoe':5}
# Per file number, accumulate the sum of squared occurrences over every
# term in words (including terms that are not part of the search).
sizeFileVec = {}
for term, innerDict in words.iteritems():
for fileNum, appearances in innerDict.iteritems():
if not sizeFileVec.has_key(fileNum):
sizeFileVec[fileNum] = 0
sizeFileVec[fileNum] += appearances ** 2
# NOTE(review): the loop variable is fileId, but the statement below indexes
# with fileNum, the name left over from the loop above. If that statement is
# the loop body, every iteration takes the square root of the same entry and
# the other entries keep their squared sums.
for fileId in sizeFileVec:
sizeFileVec[fileNum] = math.sqrt(sizeFileVec[fileNum])
# Euclidean length of the search vector: sqrt of the sum of squared counts.
sizeSearchVec = 0
for term, appearances in search.iteritems():
sizeSearchVec += appearances ** 2
sizeSearchVec = math.sqrt(sizeSearchVec)
results = []
# NOTE(review): the loop over words below rebinds `word`, and the score
# multiplies `occurrences` of the current search term by the counts of every
# term in words, not only by the counts of the matching term.
for word, occurrences in search.iteritems():
# NOTE(review): presumably nested in the loop above, in which case the
# Counter is recreated for each search term and earlier scores are discarded
# -- confirm against the original indentation.
file_relevancy = Counter()
for word, innerDict in words.iteritems():
for fileNum, appear_in_file in innerDict.iteritems():
file_relevancy[fileNum] += (occurrences * appear_in_file) / (sizeFileVec[fileNum] * sizeSearchVec)
# Keep the file numbers of the (at most) 10 highest-scoring entries.
results = [fileNum for (fileNum, count) in file_relevancy.most_common(10)]
return results
print forSearch()
`words` 是形如 {word: {fileNum: freq}} 的字典,`search` 是形如 {word: freq} 的字典。
这段代码应该计算每个文件与搜索之间的余弦相似度,从而生成最相关文件的列表。但是,它并没有给出正确的结果。
数学的运作方式如下:
          bit  dog  shoe
File 1      3    3     0
File 2      4    0     0
File 3     19    4     0
File 4      0    5     0
Search      1    3     5
sim(1,S) = ((3 * 1) + (3 * 3) + (0 * 5)) / (sqrt(3^2 + 3^2 + 0^2) * sqrt(1^2 + 3^2 + 5^2)) = 0.478
sim(2,S) = ((4 * 1) + (0 * 3) + (0 * 5)) / (sqrt(4^2 + 0^2 + 0^2) * sqrt(1^2 + 3^2 + 5^2)) = 0.169
sim(3,S) = ((19 * 1) + (4 * 3) + (0 * 5)) / (sqrt(19^2 + 4^2 + 0^2) * sqrt(1^2 + 3^2 + 5^2)) = 0.26987
sim(4,S) = ((0 * 1) + (5 * 3) + (0 * 5)) / (sqrt(0^2 + 5^2 + 0^2) * sqrt(1^2 + 3^2 + 5^2)) = 0.507
因此应该返回 [4,1,3,2]。但是,目前返回的是 [3,1,4,2]。
答案 0 :(得分:1)
把代码改成下面这样:
from math import sqrt
from collections import Counter
def forSearch(words=None, search=None):
    """Rank file numbers by cosine similarity to a search query.

    The similarity of a file is computed over the search terms only: the
    dot product of the file's counts and the query's counts, divided by the
    product of the two vector lengths.

    Args:
        words: Mapping ``{term: {file number: occurrences}}``. Defaults to
            the sample index from the question.
        search: Mapping ``{term: occurrences}`` describing the query.
            Defaults to the sample query from the question.

    Returns:
        A list of every file number found in ``words``, most similar first.

    Raises:
        KeyError: If a term in ``search`` is missing from ``words``. Here it
            assumes that each key in search is also in words.
    """
    if words is None:
        words = {
            'bit': {1: 3, 2: 4, 3: 19, 4: 0},
            'shoe': {1: 0, 2: 0, 3: 0, 4: 0},
            'dog': {1: 3, 2: 0, 3: 4, 4: 5},
            'red': {1: 0, 2: 0, 3: 15, 4: 0},
        }
    if search is None:
        search = {'bit': 1, 'dog': 3, 'shoe': 5}

    # Derive the file numbers from the data instead of hard-coding a count.
    file_ids = sorted(
        {file_id for counts in words.values() for file_id in counts}
    )

    # CALCULATE THE SIM(I,S) VALUES FOR EVERY FILE
    search_norm = sqrt(sum(freq ** 2 for freq in search.values()))
    file_relevancy = Counter()
    for file_id in file_ids:
        # A term with no entry for this file simply contributes zero.
        counts = {term: words[term].get(file_id, 0) for term in search}
        dot = sum(search[term] * counts[term] for term in search)
        file_norm = sqrt(sum(count ** 2 for count in counts.values()))
        denominator = file_norm * search_norm
        # A file that contains none of the search terms (or an empty search)
        # has a zero-length vector; score it 0.0 instead of dividing by zero.
        file_relevancy[file_id] = dot / denominator if denominator else 0.0

    # RANK THE FILES AND RETURN
    return [file_id for file_id, _ in file_relevancy.most_common()]


print(forSearch())
答案 1 :(得分:1)
这是另一个版本,即使搜索词不在 `words` 字典中也应该能正常工作。
from math import sqrt
from collections import Counter
def forSearch(words=None, search=None):
    """Rank file numbers by cosine similarity to a search query.

    Search terms that do not appear in ``words`` are treated as occurring
    zero times in every file, so they only affect the length of the search
    vector. The file vector is built from the search terms only.

    Args:
        words: Mapping ``{term: {file number: occurrences}}``. Defaults to
            the sample index from the question.
        search: Mapping ``{term: occurrences}`` describing the query.
            Defaults to the sample query from the question.

    Returns:
        A list of every file number found in ``words``, most similar first.
    """
    if words is None:
        words = {
            'bit': {1: 3, 2: 4, 3: 19, 4: 0},
            'dog': {1: 3, 2: 0, 3: 4, 4: 5},
            'red': {1: 0, 2: 0, 3: 15, 4: 0},
        }
    if search is None:
        search = {'bit': 1, 'dog': 3, 'shoe': 5}

    # Derive the file numbers from the data instead of hard-coding a count.
    file_ids = sorted(
        {file_id for per_file in words.values() for file_id in per_file}
    )

    def occurrences(term, file_id):
        # HANDLE SEARCH ITEMS THAT ARE NOT IN THE WORDS DICTIONARY:
        # an unknown term, or a file the term has no entry for, counts as 0.
        return words.get(term, {}).get(file_id, 0)

    # CALCULATE THE SIM(I,S) VALUES FOR EVERY FILE
    query_length = sqrt(sum(freq ** 2 for freq in search.values()))
    file_relevancy = Counter()
    for file_id in file_ids:
        file_vector = [occurrences(term, file_id) for term in search]
        numerator = sum(
            search[term] * occurrences(term, file_id) for term in search
        )
        file_length = sqrt(sum(count ** 2 for count in file_vector))
        scale = file_length * query_length
        # When none of the search terms occur in the file the vector length
        # is zero; score the file 0.0 rather than raising ZeroDivisionError.
        file_relevancy[file_id] = numerator / scale if scale else 0.0

    # RANK THE FILES AND RETURN
    return [file_id for file_id, _ in file_relevancy.most_common()]


print(forSearch())