top_n推荐的速度优化

时间:2018-08-01 08:21:39

标签: python pandas deep-learning recommendation-engine recommender-systems

我写了一个类,用来在动漫嵌入层(embedding)上计算余弦相似度。它可以正常工作,但运行很慢:大约 5 个用户就需要 2 分钟,而我有超过 1 万名用户,所以我想加快我的代码。

以下是基于嵌入向量计算余弦相似度(以及欧氏距离)的基础函数:

# Small constant added to the norm product in the cosine helpers so that a
# zero-norm vector does not produce a division by zero.
EPSILON = 1e-07

def cosine(x, y):
    """Return the cosine similarity of ``x`` and ``y``.

    The dot product of ``x`` with ``y.T`` is divided by the product of the
    two norms; ``EPSILON`` is added to the denominator to avoid dividing by
    zero when either norm is zero.
    """
    scale = np.linalg.norm(x) * np.linalg.norm(y)
    similarity = np.dot(x, y.T)
    return similarity / (scale + EPSILON)

def cosine_similarities(x,embeddings):
    """Return the cosine similarity between ``x`` and every row of ``embeddings``.

    Norms of ``embeddings`` are taken along axis 1, so the result holds one
    similarity per row. ``EPSILON`` is added to each denominator to avoid
    dividing by zero.
    """
    row_norms = np.linalg.norm(embeddings, axis=1)
    query_norm = np.linalg.norm(x)
    numerators = np.dot(embeddings, x)
    denominators = query_norm * row_norms
    return numerators / (denominators + EPSILON)

# Euclidean distance from x to each row of the embedding matrix.
def euclidean_distances(x,embeddings):
    """Return the Euclidean distance between ``x`` and every row of ``embeddings``."""
    offsets = embeddings - x
    return np.linalg.norm(offsets, axis=1)

然后,下面这个类用来向用户推荐动漫,并让我可以用另一个函数来评估 recall@k 和 precision@k(即前 k 个推荐的召回率与精确率)。

class DeepNN:
    """Nearest-neighbour recommender over a matrix of embeddings.

    ``embeddings`` is indexed by row; ``anime_df`` supplies the metadata used
    to turn row indices into titles. The frame must provide the columns
    ``anime_id`` and ``name`` (used to build the index -> name lookup) and
    ``genre`` and ``type`` (returned by :meth:`recommend_anime`).
    """

    MODEL_NAME = 'DeepNN_base'

    # Cached ``{map_id: name}`` lookup and the frame object it was built from.
    # Class-level defaults keep instances created without ``__init__`` usable.
    _names_cache = None
    _names_cache_df = None

    def __init__(self, embeddings, anime_df=None):
        self.embeddings = embeddings
        self.anime_df = anime_df

    def get_model_name(self):
        """Return the model's display name."""
        return self.MODEL_NAME

    def get_parameters(self):
        """Return a ``{map_id: name}`` dict built from ``self.anime_df``.

        ``map_id`` is the position of each ``anime_id`` in order of first
        appearance. As a side effect a ``map_id`` column is written onto
        ``self.anime_df``.

        The lookup is built once and reused: rebuilding it for every
        recommendation was the dominant per-call cost. The cache is
        refreshed when ``self.anime_df`` is rebound to a different object;
        in-place edits to the same frame are not detected.
        """
        if self._names_cache is not None and self._names_cache_df is self.anime_df:
            return self._names_cache

        ids_anime = self.anime_df["anime_id"].unique()
        inverse_anime_map = {val: i for i, val in enumerate(ids_anime)}
        self.anime_df["map_id"] = self.anime_df["anime_id"].map(inverse_anime_map)
        # zip over the two columns replaces a row-by-row iterrows() loop;
        # later rows still win when a map_id occurs more than once.
        namesdic = dict(zip(self.anime_df["map_id"], self.anime_df["name"]))

        self._names_cache = namesdic
        self._names_cache_df = self.anime_df
        return namesdic

    def _get_similar_anime(self, user_id, embeddings,top_n=10,euclidian= False):
        """Return the ``top_n`` rows of ``embeddings`` closest to row ``user_id``.

        Args:
            user_id: Row index into ``embeddings`` used as the query. Despite
                the name it is treated purely as a row index here
                (presumably an anime ``map_id`` - confirm with callers).
            embeddings: Matrix whose rows are compared against the query row.
            top_n: Maximum number of neighbours to return.
            euclidian: If true, rank by ascending Euclidean distance;
                otherwise rank by descending cosine similarity.

        Returns:
            A list of ``(name, score)`` tuples, best match first. The query
            row itself is excluded.
        """
        namesdic = self.get_parameters()
        query = embeddings[user_id]
        if euclidian:
            # Smaller distance means more similar: ascending order.
            distance = euclidean_distances(query, embeddings)
            order = distance.argsort()
        else:
            # Larger cosine means more similar: sort on the negated scores.
            distance = cosine_similarities(query, embeddings)
            order = (-distance).argsort()
        # Boolean mask instead of a Python-level loop over every index.
        order = order[order != user_id][:top_n]
        return list(zip([namesdic[x] for x in order], distance[order]))

    def recommend_anime(self, user_id, anime_to_ignore=None, topn=10, verbose=False):
        """Return up to ``topn`` cosine-similarity recommendations with metadata.

        Args:
            user_id: Row index into ``self.embeddings`` used as the query
                (see :meth:`_get_similar_anime`).
            anime_to_ignore: Titles to exclude from the result, e.g. items
                already seen. ``None`` means nothing is excluded.
            topn: Maximum number of recommendations to return.
            verbose: When true, a missing ``anime_df`` is reported with an
                explicit error before any work is done. The returned frame
                is the same in both modes.

        Returns:
            A DataFrame with columns ``anime_id``, ``Score``, ``genre``,
            ``name`` and ``type``, ordered best match first.

        Raises:
            Exception: If ``verbose`` is true and ``anime_df`` is ``None``.
        """
        # Checked up front: every later step dereferences ``anime_df``.
        if verbose and self.anime_df is None:
            raise Exception('"anime_df" is required in verbose mode')

        if anime_to_ignore is None:
            anime_to_ignore = []
        elif not hasattr(anime_to_ignore, "__len__"):
            # One-shot iterables are materialised so they can be sized and reused.
            anime_to_ignore = list(anime_to_ignore)

        # Over-fetch by the number of ignored titles so that filtering them
        # out can still leave ``topn`` rows.
        candidates = self._get_similar_anime(
            user_id,
            self.embeddings,
            top_n=topn + len(anime_to_ignore),
            euclidian=False,
        )
        similar_anime = pd.DataFrame(candidates, columns=['name', 'Score'])

        # Ignores items the user has already interacted with.
        keep = ~similar_anime["name"].isin(anime_to_ignore)
        recommendations_df = similar_anime.loc[keep].head(topn)

        # Restrict the metadata to the recommended titles before merging.
        anime_to_loc = self.anime_df.loc[
            self.anime_df["name"].isin(recommendations_df["name"])
        ]
        # Ranked frame on the left: an inner merge keeps the left key order,
        # so rows come back sorted by score rather than by catalogue order.
        merged = pd.merge(recommendations_df, anime_to_loc, on="name")
        return merged[['anime_id', "Score", 'genre', "name", "type"]].drop_duplicates()

0 个答案:

没有答案