我正在尝试使用潜在因子模型构建一个玩具推荐系统。于是我直接从 (http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/) 复制了代码并运行,一切正常。然后我改用字典重写了这段代码,却出现了我无法理解的问题。
原始代码在这里:
import numpy
def matrix_factorization(R, P, Q, K, steps=10000, alpha=0.0002, beta=0.02):
    """Factorize R ~= P . Q^T by stochastic gradient descent.

    Only entries of R that are > 0 take part in training; every other
    entry is skipped. After each full pass the regularized squared error
    over those entries is computed and printed, and training stops early
    once it drops below 0.001.

    Args:
        R: 2-D ratings, indexed as R[i][j] (nested list or numpy array).
        P: numpy array indexed as P[i][k], one row per row of R.
            Updated in place.
        Q: numpy array indexed as Q[j][k], one row per column of R.
            Updated in place (through a transposed view).
        K: number of latent factors to update per entry.
        steps: maximum number of passes over the observed entries.
        alpha: learning rate.
        beta: regularization weight.

    Returns:
        The tuple (P, Q), with Q back in its original (column, factor)
        orientation.
    """
    # Work on the transpose so that Q[:, j] is the factor vector of column j.
    # Q.T is a view, so writes below still land in the caller's array.
    Q = Q.T

    # R does not change during training, so locate the observed ratings once
    # instead of rescanning the whole matrix on every pass.
    observed = [
        (i, j, rating)
        for i, row in enumerate(R)
        for j, rating in enumerate(row)
        if rating > 0
    ]

    for step in range(steps):
        for i, j, rating in observed:
            eij = rating - numpy.dot(P[i, :], Q[:, j])
            for k in range(K):
                # Read both old values first so that the two updates are
                # computed from the same (pre-update) pair.
                p_ik = P[i][k]
                q_kj = Q[k][j]
                P[i][k] = p_ik + alpha * (2 * eij * q_kj - beta * p_ik)
                Q[k][j] = q_kj + alpha * (2 * eij * p_ik - beta * q_kj)

        # Regularized squared error over the observed entries.
        e = 0
        for i, j, rating in observed:
            e += pow(rating - numpy.dot(P[i, :], Q[:, j]), 2)
            for k in range(K):
                e += (beta / 2) * (pow(P[i][k], 2) + pow(Q[k][j], 2))
        if e < 0.001:
            break
        # Single-argument print() produces the same text on Python 2 and 3.
        print('step %s error: %s' % (step, e))

    print(e)
    return P, Q.T
# Toy rating matrix (5 rows x 4 columns).
R = numpy.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])

# Problem sizes come straight from the matrix; K is the number of factors.
N, M = R.shape
K = 2

# Random starting factors: one K-vector per row and one per column.
P = numpy.random.rand(N, K)
Q = numpy.random.rand(M, K)

# Train, then rebuild the full matrix from the learned factors.
nP, nQ = matrix_factorization(R, P, Q, K)
nR = nP.dot(nQ.T)
可以看到,误差在每一步都在减小。
step 4976 error: 1.26505397722
step 4977 error: 1.26497866139
step 4978 error: 1.26490338911
step 4979 error: 1.26482816036
step 4980 error: 1.2647529751
step 4981 error: 1.26467783333
step 4982 error: 1.264602735
step 4983 error: 1.26452768009
step 4984 error: 1.26445266858
step 4985 error: 1.26437770044
step 4986 error: 1.26430277565
我的代码:
import random
# Dictionary-based variant of SGD matrix factorization.
# R is read as a dict of dicts (R[i][j] is a rating); P[i] and Q[j] are
# indexed by factor number k and are updated in place, then returned.
# NOTE(review): indentation was lost in this listing; the statements are
# shown flat, exactly as they appear in the source.
def matrix_factorization(R, P, Q, K,steps=5000, alpha=0.0002, beta=0.02):
for step in xrange(steps):
for i in R.keys():
for j in R[i].keys():
# NOTE(review): the two `for` clauses pair every x in P[i] with every y in
# Q[j], so this sum equals sum(P[i]) * sum(Q[j]); it is not the
# element-wise dot product sum(P[i][k] * Q[j][k]).
eij = R[i][j] - sum([x * y for x in P[i] for y in Q[j]])
for k in xrange(K):
# Keep the old values so both updates use the same pre-update pair.
P_temp = P[i][k]
Q_temp = Q[j][k]
P[i][k] = P_temp + alpha * (2 * eij * Q_temp - beta * P_temp)
Q[j][k] = Q_temp + alpha * (2 * eij * P_temp - beta * Q_temp)
#print 'P,Q',P[i][k],Q[k][j]
# Accumulate the squared error plus the regularization term.
e = 0
for i in R.keys():
for j in R[i].keys():
# NOTE(review): same all-pairs sum as in the update loop above.
e += pow(R[i][j] - sum([x * y for x in P[i] for y in Q[j]]), 2)
for k in xrange(K):
# NOTE(review): Q is indexed as Q[j][k] in the update loop but as Q[k][j]
# here.
e += (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
# Stop early once the error falls below the threshold.
if e < 0.001:
break
print 'step',step,'error:',e
# NOTE(review): rebinding the loop variable does not affect the `for`
# iteration; the next pass overwrites it.
step += 1
print e
return P,Q
# Sparse ratings: outer key -> {inner key: rating}; absent pairs have no entry.
R = {
    0: {0: 5, 1: 3, 3: 1},
    1: {0: 4, 3: 1},
    2: {0: 1, 1: 1, 3: 5},
    3: {0: 1, 3: 4},
    4: {1: 1, 2: 5, 3: 4},
}

# Number of outer keys, number of inner keys, number of latent factors.
N, M, K = 5, 4, 4

# Random starting factors: one list of K floats per key.
P = {i: [random.random() for _ in xrange(K)] for i in xrange(N)}
Q = {j: [random.random() for _ in xrange(K)] for j in xrange(M)}

P, Q = matrix_factorization(R, P, Q, K)
这与上一段代码几乎相同,唯一的区别是我改用字典来实现。但它的输出是:
step 4944 error: 12786002.1942
step 4945 error: 12838370.3896
step 4946 error: 12890953.0588
step 4947 error: 12943751.0801
step 4948 error: 12996765.3355
step 4949 error: 13049996.7107
step 4950 error: 13103446.0947
step 4951 error: 13157114.3803
step 4952 error: 13211002.4639
step 4953 error: 13265111.2458
step 4954 error: 13319441.6297
step 4955 error: 13373994.5232
step 4956 error: 13428770.8375
step 4957 error: 13483771.4875
step 4958 error: 13538997.392
step 4959 error: 13594449.4735
step 4960 error: 13650128.6582
step 4961 error: 13706035.8761
step 4962 error: 13762172.0611
step 4963 error: 13818538.1509
step 4964 error: 13875135.0871
step 4965 error: 13931963.8149
step 4966 error: 13989025.2837
step 4967 error: 14046320.4465
step 4968 error: 14103850.2604
step 4969 error: 14161615.6864
step 4970 error: 14219617.6893
step 4971 error: 14277857.2379
step 4972 error: 14336335.3052
step 4973 error: 14395052.8678
step 4974 error: 14454010.9066
误差在每一步都变得越来越大,这让我很困惑。
非常感谢你的时间!
答案 0 :(得分:2)
我认为下面这种写法更好:
def matrix_factorization(R, P, Q, K, steps=10000, alpha=0.002, beta=0.02):
    """Factorize R ~= P . Q^T with full-batch, vectorized gradient steps.

    Entries of R equal to 0 are masked out of the error, so they do not
    contribute to the updates. The step number and the RMSE over the
    non-zero entries (measured before that step's update) are printed on
    every step.

    Args:
        R: 2-D numpy array of ratings.
        P: numpy array with one row of factors per row of R.
        Q: numpy array with one row of factors per column of R.
        K: unused here; kept so the signature matches the loop-based
            versions.
        steps: number of gradient steps to run.
        alpha: learning rate.
        beta: regularization weight.

    Returns:
        The tuple (P, Q) of newly allocated factor arrays, with Q in its
        original (column, factor) orientation. The input arrays are not
        modified.
    """
    # Work on the transpose so that P.dot(Q) has the same shape as R.
    Q = Q.T

    # `!=` replaces the `<>` operator, which no longer exists in Python 3.
    observed = numpy.asarray(R) != 0
    n_observed = numpy.count_nonzero(observed)

    for step in range(steps):
        # Error on observed entries only; masked entries contribute 0 - 0... 
        # more precisely R is 0 there and the prediction is zeroed out.
        E = R - observed * P.dot(Q)

        # The right-hand side is evaluated in full before either name is
        # rebound, so both updates are computed from the previous P and Q.
        P, Q = (
            P + alpha * (E.dot(Q.T) - beta * P),
            Q + alpha * (P.T.dot(E) - beta * Q),
        )

        rmse = numpy.sqrt(numpy.sum(E ** 2) / n_observed)
        # Single-argument print() produces the same text on Python 2 and 3.
        print('step:%s' % step)
        print('RMSE: %s' % rmse)
    return P, Q.T
答案 1 :(得分:0)
用 sum 函数替换 numpy.dot 函数的做法似乎有误:双重 for 的列表推导会把 P[i] 与 Q[j] 的所有元素两两相乘,而不是按对应位置相乘后再求和。
另外,我认为 e += ... + pow(Q[k][j], 2) 应该改为 e += ... + pow(Q[j][k], 2)。
我按如下方式修改了函数 matrix_factorization,结果看起来是正确的:可以看到误差在每一步都在减小。
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    """Factorize sparse ratings R ~= P . Q^T by stochastic gradient descent.

    Args:
        R: dict of dicts; R[i][j] is the rating for the pair (i, j). Pairs
            that are absent from R are not used.
        P: dict mapping each outer key i of R to a mutable sequence of at
            least K numbers. Updated in place.
        Q: dict mapping each inner key j of R to a mutable sequence of at
            least K numbers. Updated in place.
        K: number of latent factors to use.
        steps: maximum number of passes over the ratings.
        alpha: learning rate.
        beta: regularization weight.

    Returns:
        The tuple (P, Q): the same objects that were passed in. Training
        stops early once the regularized squared error drops below 0.001.
    """
    factors = range(K)
    for _ in range(steps):
        for i, ratings in R.items():
            p_i = P[i]
            for j, rating in ratings.items():
                q_j = Q[j]
                # Element-wise dot product over the first K factors.
                eij = rating - sum(p_i[k] * q_j[k] for k in factors)
                for k in factors:
                    # Read both old values first so that the two updates
                    # are computed from the same (pre-update) pair.
                    p, q = p_i[k], q_j[k]
                    p_i[k] = p + alpha * (2 * eij * q - beta * p)
                    q_j[k] = q + alpha * (2 * eij * p - beta * q)

        # Regularized squared error over the known ratings.
        e = 0
        for i, ratings in R.items():
            p_i = P[i]
            for j, rating in ratings.items():
                q_j = Q[j]
                e += pow(rating - sum(p_i[k] * q_j[k] for k in factors), 2)
                for k in factors:
                    e += (beta / 2) * (pow(p_i[k], 2) + pow(q_j[k], 2))
        if e < 0.001:
            break
    return P, Q