Question

我正在研究一种光谱聚类算法的变体，该算法在更新的相似性矩阵W上执行标准光谱嵌入n_iters次，因此我必须检查标准化拉普拉矩阵的PSD { {1}}在特征分解之前由每次迭代更新L_rw生成。 问题在于，即使初始化的W在理论上始终是PSD，L_rw有时也会产生负特征值（有时并不总是）。总之，arpack不会总是保证半定对称矩阵的非负特征值。下面是用来说明我遇到的问题的示例代码。

arpack

当我运行它：

import numpy as np
import scipy.sparse as sp
from sklearn.utils.extmath import safe_sparse_dot
from scipy.sparse.linalg import arpack
def my_laplacian(W):
    """ Generate normed laplacian matrix L_rw for similarity matrix W 
        (L_rw is according to the definition in `A Tutorial on Spectral 
        Clustering` by Von Luxburg, 2007)
    """
    D = gen_D(W)
    if not np.all(D != 0):
        raise ValueError('W exists all-zero row')
    L_rw = normalize_L(W, D)
    if not isPSD(L_rw):
        raise ValueError('L_rw isn\'t PSD')
    return L_rw
def gen_D(W):
    if sp.issparse(W):
        return np.asarray(W.sum(axis = 0)).squeeze()
    else:
        return W.sum(axis = 0)
def normalize_L(W, D):
    """ Normalize laplacian L with formula L_rw = I - D ^ (-1) * W
    """
    tmp = np.reciprocal(D)
    if sp.issparse(W):
        return sp.eye(W.shape[0], format = 'csr') - safe_sparse_dot(sp.diags(tmp, 0).tocsr(), W)
    else:
        return np.fill_diagonal(-W, 1 - np.einsum('i,i->i', tmp, W.diagonal()))
def isPSD(L, tol = 1e-10):
    """ Check wheter L is positive semi-definite
    """
    vals_large, vecs_large = arpack.eigsh(L, k = 10, which = 'LM', maxiter = 6000)
    vals_small, vecs_small = arpack.eigsh(L, k = 10, sigma = 0, which = 'LM', maxiter = 6000)
    return np.all(vals_small > - tol) and np.all(vals_large > -tol)

然后我In [2]: X = np.random.random_integers(0, 500, size = (500, 1200)) In [3]: from sklearn.metrics.pairwise import * In [4]: additive_chi2_kernel(X) Out[4]: array([[ -0. , -108450.9702963 , -110301.71485863, ..., -114578.16996935, -106849.35599894, -102370.37161802], [-108450.9702963 , -0. , -109129.14648767, ..., -112274.78420127, -107293.6284921 , -103749.55524937], [-110301.71485863, -109129.14648767, -0. , ..., -109282.25057724, -99851.02992075, -103023.45281752], ..., [-114578.16996935, -112274.78420127, -109282.25057724, ..., -0. , -110133.11568012, -111218.58079982], [-106849.35599894, -107293.6284921 , -99851.02992075, ..., -110133.11568012, -0. , -102516.65009177], [-102370.37161802, -103749.55524937, -103023.45281752, ..., -111218.58079982, -102516.65009177, -0. ]]) In [5]: W = np.exp(-_ * (1. / X.shape[1])) In [6]: W = sp.coo_matrix(W).tocsr() In [7]: my_laplacian(W) --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-7-9b100976a310> in <module>() ----> 1 my_laplacian(W) <ipython-input-1-dbf198146761> in my_laplacian(W) 16 L_rw = normalize_L(W, D) 17 if not isPSD(L_rw): ---> 18 raise ValueError('L_rw isn\'t PSD') 19 return L_rw ValueError: L_rw isn't PSD，但没有pdb.run('L_rw = my_laplacian(W)')

ValueError

我又跑了一次：

(Pdb) n
> <ipython-input-1-dbf198146761>(17)my_laplacian()
-> if not isPSD(L_rw):
(Pdb) L_rw
<500x500 sparse matrix of type '<type 'numpy.float64'>'
    with 250000 stored elements in Compressed Sparse Row format>
(Pdb) n
> <ipython-input-1-dbf198146761>(19)my_laplacian()
-> return L_rw

In [9]: my_laplacian(W) Out[9]: <500x500 sparse matrix of type '<type 'numpy.float64'>' with 250000 stored elements in Compressed Sparse Row format> In [10]: my_laplacian(W) Out[10]: <500x500 sparse matrix of type '<type 'numpy.float64'>' with 250000 stored elements in Compressed Sparse Row format> In [11]: my_laplacian(W) --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-1-9b100976a310> in <module>() ----> 1 my_laplacian(W) <ipython-input-1-dbf198146761> in my_laplacian(W) 16 L_rw = normalize_L(W, D) 17 if not isPSD(L_rw): ---> 18 raise ValueError('L_rw isn\'t PSD') 19 return L_rw似乎不太稳定？或者我弄错了什么？有没有人可以解决一些问题？

scipy.sparse.linalg.arpack.eigsh对正半正定稀疏矩阵的特征分解的稳定性？

0 个答案: