使用字典实现矩阵分解时,因子向量越来越大

时间:2014-03-19 15:51:43

标签: python dictionary matrix-factorization

我正在尝试使用潜在因子模型构建一个玩具推荐系统。于是我直接从(http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/)复制了代码并运行,一切正常。随后我改用字典重写了这段代码,却出现了我无法理解的问题。

原始代码在这里:

import numpy

def matrix_factorization(R, P, Q, K, steps=10000, alpha=0.0002, beta=0.02):
    """Factorise ``R`` into ``P`` and ``Q`` by per-entry gradient updates.

    Only entries with ``R[i][j] > 0`` take part; every other cell is
    ignored. After each pass the regularised squared error is printed,
    and the loop stops early once that error drops below 0.001.

    Args:
        R: 2-D numpy array; cells that are not > 0 are skipped.
        P: numpy array indexed ``P[i][k]``; updated in place.
        Q: numpy array indexed ``Q[j][k]``; updated in place through its
            transposed view.
        K: number of factor columns to update.
        steps: maximum number of passes over the observed cells.
        alpha: step size of each update.
        beta: regularisation weight.

    Returns:
        The tuple ``(P, Q)`` with ``Q`` back in its original orientation.
    """
    Q = Q.T

    # R is never modified, so the observed cells are collected once
    # (row-major, the same visiting order as a nested index loop).
    observed = [(i, j)
                for i in range(len(R))
                for j in range(len(R[i]))
                if R[i][j] > 0]

    # Float division so an integer beta is not truncated on Python 2.
    half_beta = beta / 2.0

    # Defined up front so the final print also works when steps == 0.
    e = 0
    for step in range(steps):
        for i, j in observed:
            eij = R[i][j] - numpy.dot(P[i, :], Q[:, j])
            for k in range(K):
                # Both updates must use the values from before this k-step.
                P_temp = P[i][k]
                Q_temp = Q[k][j]

                P[i][k] = P_temp + alpha * (2 * eij * Q_temp - beta * P_temp)
                Q[k][j] = Q_temp + alpha * (2 * eij * P_temp - beta * Q_temp)

        e = 0
        for i, j in observed:
            e = e + pow(R[i][j] - numpy.dot(P[i, :], Q[:, j]), 2)
            for k in range(K):
                e = e + half_beta * (pow(P[i][k], 2) + pow(Q[k][j], 2))
        if e < 0.001:
            break
        # A single pre-formatted argument prints the same text on
        # Python 2 and Python 3.
        print('step %s error: %s' % (step, e))

    print(e)
    return P, Q.T

# Ratings matrix; matrix_factorization skips every cell that is not > 0.
R = numpy.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])

# N rows and M columns of R, K factors per row/column.
N, M = R.shape
K = 2

# Random starting factors, drawn in the order P then Q.
P = numpy.random.rand(N, K)
Q = numpy.random.rand(M, K)

nP, nQ = matrix_factorization(R, P, Q, K)

# Reconstructed matrix from the fitted factors.
nR = nP.dot(nQ.T)

可以看到,误差在每一步都在减小。

step 4976 error: 1.26505397722
step 4977 error: 1.26497866139
step 4978 error: 1.26490338911
step 4979 error: 1.26482816036
step 4980 error: 1.2647529751
step 4981 error: 1.26467783333
step 4982 error: 1.264602735
step 4983 error: 1.26452768009
step 4984 error: 1.26445266858
step 4985 error: 1.26437770044
step 4986 error: 1.26430277565

我的代码:

import random


def matrix_factorization(R, P, Q, K,steps=5000, alpha=0.0002, beta=0.02):
    """Dictionary-based rewrite of the factorisation routine, as posted.

    R is indexed as R[i][j] through nested mappings (``.keys()`` is called
    on both levels); P[i] and Q[j] are indexed by k in range(K) and are
    updated in place. Runs up to ``steps`` passes, prints the error after
    each pass, stops early when the error is below 0.001, and returns
    ``(P, Q)``.

    NOTE(review): the two lines flagged below differ from the numpy
    version earlier in this file; they look like the cause of the growing
    error reported for this function -- confirm before relying on it.
    """
    for step in xrange(steps):

        for i in R.keys():
            for j in R[i].keys():
                # NOTE(review): the double "for" multiplies every x in P[i]
                # with every y in Q[j] (all pairs), i.e. it evaluates to
                # sum(P[i]) * sum(Q[j]) rather than a k-by-k dot product.
                eij = R[i][j] - sum([x * y for x in P[i] for y in Q[j]])
                for k in xrange(K):
                    # Snapshot both values so each update uses the pre-step pair.
                    P_temp = P[i][k]
                    Q_temp = Q[j][k]

                    P[i][k] = P_temp + alpha * (2 * eij * Q_temp - beta * P_temp)
                    Q[j][k] = Q_temp + alpha * (2 * eij * P_temp - beta * Q_temp)
                    #print 'P,Q',P[i][k],Q[k][j]

        # Squared residuals plus the beta-weighted squared factor values.
        e = 0
        for i in R.keys():
            for j in R[i].keys():
                # Same all-pairs sum as in the update loop above.
                e += pow(R[i][j] - sum([x * y for x in P[i] for y in Q[j]]), 2)
                for k in xrange(K):
                    # NOTE(review): Q is indexed [k][j] here but [j][k] in
                    # the update loop above.
                    e += (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))

        if e < 0.001:
            break

        print 'step',step,'error:',e
        # No lasting effect: the for loop rebinds step on the next pass.
        step += 1

    print e
    return P,Q



# Sparse ratings: outer key -> inner key -> rating. Pairs that are absent
# are simply never visited by matrix_factorization.
R = {
    0: {0: 5, 1: 3, 3: 1},
    1: {0: 4, 3: 1},
    2: {0: 1, 1: 1, 3: 5},
    3: {0: 1, 3: 4},
    4: {1: 1, 2: 5, 3: 4},
}

# N outer keys, M inner keys, K factors per key.
N, M, K = 5, 4, 4

# One list of K random starting values per key, drawn for P first, then Q.
P = dict((i, [random.random() for _ in range(K)]) for i in range(N))
Q = dict((j, [random.random() for _ in range(K)]) for j in range(M))

P, Q = matrix_factorization(R, P, Q, K)

与前面的代码几乎相同,唯一的区别是我改用字典来实现。但它的输出是:

step 4944 error: 12786002.1942
step 4945 error: 12838370.3896
step 4946 error: 12890953.0588
step 4947 error: 12943751.0801
step 4948 error: 12996765.3355
step 4949 error: 13049996.7107
step 4950 error: 13103446.0947
step 4951 error: 13157114.3803
step 4952 error: 13211002.4639
step 4953 error: 13265111.2458
step 4954 error: 13319441.6297
step 4955 error: 13373994.5232
step 4956 error: 13428770.8375
step 4957 error: 13483771.4875
step 4958 error: 13538997.392
step 4959 error: 13594449.4735
step 4960 error: 13650128.6582
step 4961 error: 13706035.8761
step 4962 error: 13762172.0611
step 4963 error: 13818538.1509
step 4964 error: 13875135.0871
step 4965 error: 13931963.8149
step 4966 error: 13989025.2837
step 4967 error: 14046320.4465
step 4968 error: 14103850.2604
step 4969 error: 14161615.6864
step 4970 error: 14219617.6893
step 4971 error: 14277857.2379
step 4972 error: 14336335.3052
step 4973 error: 14395052.8678
step 4974 error: 14454010.9066

误差在每一步都变得越来越大,这让我很困惑。

非常感谢你的时间!

2 个答案:

答案 0 :(得分:2)

我认为这个更好

def matrix_factorization(R, P, Q, K, steps=10000, alpha=0.002, beta=0.02):
    """Factorise ``R`` into ``P`` and ``Q`` with whole-matrix update steps.

    Every step computes the residual on the non-zero cells of ``R`` and
    moves both factor matrices at once. The step index and the RMSE of
    that residual (measured before the step's update) are printed.

    Args:
        R: 2-D array-like; cells equal to 0 are excluded from the residual.
        P: numpy array with one row per row of ``R``; not modified.
        Q: numpy array with one row per column of ``R``; not modified.
        K: unused here; kept so the signature matches the other versions.
        steps: number of update steps.
        alpha: step size.
        beta: regularisation weight.

    Returns:
        The tuple ``(P, Q)`` of newly allocated factor matrices.
    """
    Q = Q.T

    # 1.0 where R is non-zero, 0.0 elsewhere. "!=" replaces the obsolete
    # "<>" spelling, which Python 3 no longer accepts.
    Indi = (numpy.asarray(R) != 0).astype(float)

    # The mask never changes, so count the observed cells once.
    observed = numpy.count_nonzero(Indi)

    for step in range(steps):
        E = R - numpy.multiply(Indi, P.dot(Q))

        # Both updates must read the factors from the start of the step,
        # so the new P is held back until Q has been computed.
        P_next = P + alpha * (E.dot(Q.T) - beta * P)
        Q = Q + alpha * (P.T.dot(E) - beta * Q)
        P = P_next

        if observed:
            flat = E.ravel()
            rmse = numpy.sqrt(flat.dot(flat) / observed)
        else:
            # No observed cell: the residual is empty, avoid 0/0.
            rmse = 0.0
        print('step:%s' % step)
        print('RMSE: %s' % rmse)
    return P, Q.T

答案 1 :(得分:0)

用sum函数替换numpy.dot函数的方式似乎有误:sum([x * y for x in P[i] for y in Q[j]])计算的是P[i]与Q[j]中所有元素两两相乘之和,而不是两个向量的点积。 另外我认为e += ... + pow(Q[k][j], 2)应该是e += ... + pow(Q[j][k], 2)

我按如下方式修改了函数matrix_factorization,之后结果看起来是正确的,可以看到误差在每一步都在减小。

def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    """Factorise sparse ratings ``R`` into ``P`` and ``Q`` (dict version).

    Each stored rating ``R[i][j]`` is compared with the dot product of
    ``P[i]`` and ``Q[j]`` over the first ``K`` factors, and both factor
    lists are nudged along the gradient. After every pass the regularised
    squared error is computed, and the loop stops early once it is below
    0.001.

    Args:
        R: mapping ``i -> {j -> rating}``; only stored pairs are used.
        P: mapping ``i -> list`` of at least ``K`` numbers; updated in place.
        Q: mapping ``j -> list`` of at least ``K`` numbers; updated in place.
        K: number of factors.
        steps: maximum number of passes over the stored ratings.
        alpha: step size of each update.
        beta: regularisation weight.

    Returns:
        The same ``P`` and ``Q`` objects, as a tuple.
    """
    # Float division: on Python 2 an integer beta would make beta/2
    # truncate and drop the penalty from the error.
    half_beta = beta / 2.0
    factors = range(K)

    for _ in range(steps):
        for i, row in R.items():
            p_i = P[i]
            for j, rating in row.items():
                q_j = Q[j]
                eij = rating - sum(p_i[k] * q_j[k] for k in factors)
                for k in factors:
                    # Both updates must use the values from before this k-step.
                    p_old = p_i[k]
                    q_old = q_j[k]
                    p_i[k] = p_old + alpha * (2 * eij * q_old - beta * p_old)
                    q_j[k] = q_old + alpha * (2 * eij * p_old - beta * q_old)

        # Regularised squared error over the stored ratings.
        e = 0
        for i, row in R.items():
            p_i = P[i]
            for j, rating in row.items():
                q_j = Q[j]
                e += pow(rating - sum(p_i[k] * q_j[k] for k in factors), 2)
                for k in factors:
                    e += half_beta * (pow(p_i[k], 2) + pow(q_j[k], 2))
        if e < 0.001:
            break
    return P, Q