我写了一个类,用于在动漫嵌入层(embedding)上计算余弦相似度。它可以正常工作,但运行很慢:大约 5 个用户就需要 2 分钟,而我有超过 1 万名用户,所以我想加快代码的运行速度。
下面是根据嵌入向量计算余弦相似度(以及欧氏距离)的基础函数:
# Small constant added to the denominator so that a zero-norm vector does not
# cause a division by zero.
EPSILON = 1e-07


def cosine(x, y):
    """Return the cosine similarity between ``x`` and ``y``.

    The dot product of ``x`` with ``y.T`` is divided by the product of the
    two norms plus ``EPSILON``.
    """
    numerator = np.dot(x, y.T)
    x_norm = np.linalg.norm(x)
    y_norm = np.linalg.norm(y)
    denominator = x_norm * y_norm + EPSILON
    return numerator / denominator
def cosine_similarities(x, embeddings):
    """Return the cosine similarity between ``x`` and every row of ``embeddings``.

    ``embeddings`` is reduced along ``axis=1`` for the norms, so each row is
    treated as one vector; the result has one score per row.
    """
    numerators = np.dot(embeddings, x)
    query_norm = np.linalg.norm(x)
    row_norms = np.linalg.norm(embeddings, axis=1)
    # EPSILON keeps the division defined when a norm is zero.
    denominators = query_norm * row_norms + EPSILON
    return numerators / denominators
def euclidean_distances(x, embeddings):
    """Return the Euclidean distance between ``x`` and every row of ``embeddings``."""
    offsets = embeddings - x
    return np.linalg.norm(offsets, axis=1)
然后,下面这个类用来向用户推荐动漫,并让我可以用另一个函数来评估 recall@k(召回率)和 precision@k(精确率)。
class DeepNN:
    """Recommend anime by ranking embedding rows against one query row.

    ``embeddings`` is indexed by row position; ``anime_df`` supplies the
    ``anime_id`` and ``name`` columns used to turn a row position back into a
    title (row ``i`` corresponds to the ``i``-th distinct ``anime_id`` in
    ``anime_df``).
    """

    MODEL_NAME = 'DeepNN_base'

    def __init__(self, embeddings, anime_df=None):
        self.embeddings = embeddings
        self.anime_df = anime_df
        # Lazily built ``(anime_df, {row index: name})`` pair, see
        # get_parameters(). Keeping the frame lets us detect a swapped-out
        # ``anime_df`` and rebuild.
        self._names_cache = None

    def get_model_name(self):
        """Return the model's display name."""
        return self.MODEL_NAME

    def get_parameters(self):
        """Return the ``{embedding row index: anime name}`` lookup.

        As a side effect, a ``map_id`` column (position of each ``anime_id``
        in order of first appearance) is written to ``anime_df``.

        The lookup is built once per ``anime_df`` object and then reused:
        rebuilding it with ``iterrows()`` on every query dominated the cost
        of a recommendation.

        Raises:
            ValueError: if no ``anime_df`` was supplied.
        """
        anime_df = self.anime_df
        if anime_df is None:
            raise ValueError('"anime_df" is required to resolve anime names')

        cached = getattr(self, "_names_cache", None)
        if cached is not None and cached[0] is anime_df:
            return cached[1]

        unique_ids = anime_df["anime_id"].unique()
        index_by_id = {anime_id: idx for idx, anime_id in enumerate(unique_ids)}
        anime_df["map_id"] = anime_df["anime_id"].map(index_by_id)
        # Column-wise zip instead of iterrows(); as before, the last row wins
        # when several rows share one anime_id.
        names = dict(zip(anime_df["map_id"].tolist(), anime_df["name"].tolist()))
        self._names_cache = (anime_df, names)
        return names

    def _get_similar_anime(self, user_id, embeddings, top_n=10, euclidian=False):
        """Return the ``top_n`` rows of ``embeddings`` closest to row ``user_id``.

        Args:
            user_id: row index into ``embeddings`` used as the query; that
                row itself is excluded from the result.
            embeddings: array with one embedding per row.
            top_n: maximum number of neighbours to return.
            euclidian: rank by ascending Euclidean distance when true,
                otherwise by descending cosine similarity.

        Returns:
            A list of ``(name, score)`` tuples, best match first.
        """
        names = self.get_parameters()
        query = embeddings[user_id]
        if euclidian:
            # Smaller distance means more similar: ascending order.
            scores = euclidean_distances(query, embeddings)
            order = scores.argsort()
        else:
            # Larger cosine means more similar: sort on the negated scores.
            scores = cosine_similarities(query, embeddings)
            order = (-scores).argsort()
        # Drop the query row with a vectorised mask, then keep the best top_n.
        order = order[order != user_id][:top_n]
        return list(zip([names[idx] for idx in order], scores[order]))

    def recommend_anime(self, user_id, anime_to_ignore=None, topn=10, verbose=False):
        """Recommend up to ``topn`` anime similar to embedding row ``user_id``.

        Args:
            user_id: row index into ``self.embeddings`` used as the query.
            anime_to_ignore: iterable of anime names to leave out (for
                example, titles already seen). ``None`` means nothing is
                ignored.
            topn: maximum number of recommendations to return.
            verbose: when true, join the recommendations with ``anime_df``
                and return the ``anime_id``, ``Score``, ``genre``, ``name``
                and ``type`` columns.

        Returns:
            A DataFrame with ``name`` and ``Score`` columns (or the verbose
            column set described above).

        Raises:
            Exception: if ``verbose`` is requested without an ``anime_df``.
        """
        # Checked up front: previously this ran only after anime_df had
        # already been dereferenced, so it could never fire.
        if verbose and self.anime_df is None:
            raise Exception('"anime_df" is required in verbose mode')

        ignored = [] if anime_to_ignore is None else list(anime_to_ignore)

        # Ask for enough candidates that topn can still be served after the
        # ignored titles are filtered out.
        candidates = self._get_similar_anime(
            user_id,
            self.embeddings,
            top_n=topn + len(ignored),
            euclidian=False,
        )
        similar_anime = pd.DataFrame(candidates, columns=['name', 'Score'])

        # Ignore items the user has already interacted with.
        keep = ~similar_anime["name"].isin(ignored)
        recommendations_df = similar_anime.loc[keep].head(topn)

        if verbose:
            recommended_names = list(recommendations_df["name"])
            matched = self.anime_df.loc[self.anime_df["name"].isin(recommended_names)]
            recommendations_df = pd.merge(
                matched, recommendations_df, left_on="name", right_on="name"
            )[['anime_id', "Score", 'genre', "name", "type"]].drop_duplicates()
        return recommendations_df