I am trying to find similar users in the MovieLens data using numpy in Python, so that all of the computation stays fast. However, I cannot work out the final code that uses matrix multiplications and related operations to compute the similarities.
Here is the code:
import pandas as pd
import numpy as np
# pass in column names for each CSV and read them using pandas.
# Column names available in the readme file
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in xrange(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], size=10, replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0))
    return train, test
def similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling divide-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    # Up to here I have the matrix containing the users' dot products with each other and with themselves..
    # What should be computed and returned from here ??
if __name__ == '__main__':
    # Reading users file:
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                        encoding='latin-1')
    # Reading ratings file:
    r_cols = ['user_id', 'item_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                          encoding='latin-1')
    # Reading items file:
    i_cols = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
              'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
                        encoding='latin-1')
    r_cols = ['user_id', 'item_id', 'rating', 'unix_timestamp']
    ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
    ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
    n_users = ratings.user_id.unique().shape[0]
    n_items = ratings.item_id.unique().shape[0]
    # Build the dense n_users x n_items rating matrix (user and item ids are 1-indexed in the file)
    rating_array = np.zeros((n_users, n_items))
    for row in ratings.itertuples():
        rating_array[row[1]-1, row[2]-1] = row[3]
    print rating_array
    sparsity = float(len(rating_array.nonzero()[0]))
    sparsity = sparsity / (rating_array.shape[0] * rating_array.shape[1])
    sparsity = sparsity * 100
    print 'Sparsity: {:4.2f}%'.format(sparsity)
    train, test = train_test_split(rating_array)
    train = train[:6][:6]
    user_similarity = similarity(train, kind='user')
    #print user_similarity[:6][:6]
So any help with writing the matrix computation, together with an explanation, would help me understand how to work out these similarities.
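For reference, below is a minimal sketch of one way the similarity function could be completed, assuming cosine similarity is the measure I am after; the norms variable and the top-5 example are illustrative additions, not part of the original code.

import numpy as np

def similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number that avoids divide-by-zero for all-zero rows/columns
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon      # dot product of every pair of user rows
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon      # dot product of every pair of item columns
    # The diagonal of sim holds each vector's dot product with itself,
    # i.e. its squared L2 norm. Dividing entry (i, j) by norm_i * norm_j
    # turns the raw dot products into cosine similarities:
    #   cos(u, v) = u . v / (||u|| * ||v||)
    norms = np.array([np.sqrt(np.diagonal(sim))])   # shape (1, n)
    return sim / norms / norms.T                    # columns divided by norms, rows by norms.T

With this, similarity(train, kind='user') would return an n_users x n_users matrix whose (i, j) entry is the cosine similarity between users i and j (the diagonal is 1), and something like np.argsort(user_similarity[0])[::-1][1:6] would give the indices of the five users most similar to user 0.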