Python SVD 脚本:为机器学习构建评分矩阵(Matrix)

我正在使用 MovieLens 数据集。ratings.dat / ratings.csv 的格式是:

  • 用户id,MovieId,评级,时间戳
  • 1,1,5.0,52234234


 user    movie   rating
    1       43      3
    1       57      2
    2       219     4


user        1   2
movie   43  3   0
        57  2   0
        219 0   4



据我所知,为了计算相似度并据此给出推荐,我需要一个矩阵,其中第一行(userId = 1)包含该用户对每部电影的评分(0–5)。


import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import pickle
# Input ratings file: comma-separated, no header row, with columns
# user id, movie id, rating, timestamp.
RATINGS_PATH = r'rat.csv'

# Number of singular values/vectors requested from the truncated SVD.
N_FACTORS = 10


def build_rating_matrix(ratings):
    """Build a sparse user x movie rating matrix from a ratings table.

    Args:
        ratings: DataFrame whose columns 0, 1 and 2 hold the user id, the
            movie id and the rating. Any further columns (e.g. a timestamp)
            are ignored.

    Returns:
        A tuple ``(matrix, users, movies)``. ``matrix`` is a float CSR matrix
        of shape ``(len(users), len(movies))``; ``users`` and ``movies`` are
        the sorted unique ids, so ``users[r]`` is the user id of row ``r`` and
        ``movies[c]`` is the movie id of column ``c``. Cells without a rating
        are zero.
    """
    # The COO-style constructor sums duplicate coordinates, so keep only the
    # last rating of every (user, movie) pair: later rows overwrite earlier
    # ones, as with element-wise assignment.
    ratings = ratings.drop_duplicates(subset=[0, 1], keep="last")

    # return_inverse yields, for every rating row, the position of its id in
    # the sorted unique array, i.e. directly the matrix row/column index.
    users, row_idx = np.unique(ratings[0].to_numpy(), return_inverse=True)
    movies, col_idx = np.unique(ratings[1].to_numpy(), return_inverse=True)

    # Keep ratings as floats so half-star values (e.g. 3.5) are not truncated.
    values = ratings[2].to_numpy(dtype=float)

    matrix = sp.csr_matrix(
        (values, (row_idx, col_idx)), shape=(len(users), len(movies))
    )
    return matrix, users, movies


def low_rank_reconstruction(u, s, vt):
    """Return the dense product ``u @ diag(s) @ vt``.

    Args:
        u: Array of shape ``(n_rows, k)`` with the left singular vectors.
        s: Array of shape ``(k,)`` with the singular values.
        vt: Array of shape ``(k, n_cols)`` with the right singular vectors.

    Returns:
        Dense array of shape ``(n_rows, n_cols)``.
    """
    # Scaling the columns of u by s is equivalent to multiplying by diag(s)
    # without materialising the diagonal matrix.
    return (u * s) @ vt


def _save_pickle(obj, path):
    """Pickle ``obj`` to the file at ``path``."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def main():
    """Read the ratings file, factorise the rating matrix, save the results."""
    data_file = pd.read_csv(RATINGS_PATH, sep=',', header=None)

    V, _users, _movies = build_rating_matrix(data_file)
    # Building the matrix and the SVD take a while, so every intermediate
    # result is saved for later reuse.
    _save_pickle(V, 'movielens_1M.pickle')

    # Truncated SVD of the sparse rating matrix.
    u, s, vt = svds(V, k=N_FACTORS)
    _save_pickle(u, 'movielens_1M_svd_u.pickle')
    _save_pickle(s, 'movielens_1M_svd_s.pickle')
    _save_pickle(vt, 'movielens_1M_svd_vt.pickle')

    # Dense low-rank approximation of the rating matrix.
    X_lr = low_rank_reconstruction(u, s, vt)
    _save_pickle(X_lr, 'movielens.pickle')


if __name__ == "__main__":
    main()

