我正在研究一种光谱聚类算法的变体,该算法在更新的相似性矩阵W
上执行标准光谱嵌入n_iters
次,因此我必须检查标准化拉普拉矩阵的PSD { {1}}在特征分解之前由每次迭代更新L_rw
生成。 问题在于,即使初始化的W
在理论上始终是PSD,L_rw
有时也会产生负特征值(有时并不总是)。总之,arpack
不会总是保证半定对称矩阵的非负特征值。下面是用来说明我遇到的问题的示例代码。
arpack
当我运行它:
import numpy as np
import scipy.sparse as sp
from sklearn.utils.extmath import safe_sparse_dot
from scipy.sparse.linalg import arpack
def my_laplacian(W):
""" Generate normed laplacian matrix L_rw for similarity matrix W
(L_rw is according to the definition in `A Tutorial on Spectral
Clustering` by Von Luxburg, 2007)
"""
D = gen_D(W)
if not np.all(D != 0):
raise ValueError('W exists all-zero row')
L_rw = normalize_L(W, D)
if not isPSD(L_rw):
raise ValueError('L_rw isn\'t PSD')
return L_rw
def gen_D(W):
if sp.issparse(W):
return np.asarray(W.sum(axis = 0)).squeeze()
else:
return W.sum(axis = 0)
def normalize_L(W, D):
""" Normalize laplacian L with formula L_rw = I - D ^ (-1) * W
"""
tmp = np.reciprocal(D)
if sp.issparse(W):
return sp.eye(W.shape[0], format = 'csr') - safe_sparse_dot(sp.diags(tmp, 0).tocsr(), W)
else:
return np.fill_diagonal(-W, 1 - np.einsum('i,i->i', tmp, W.diagonal()))
def isPSD(L, tol = 1e-10):
""" Check wheter L is positive semi-definite
"""
vals_large, vecs_large = arpack.eigsh(L, k = 10, which = 'LM', maxiter = 6000)
vals_small, vecs_small = arpack.eigsh(L, k = 10, sigma = 0, which = 'LM', maxiter = 6000)
return np.all(vals_small > - tol) and np.all(vals_large > -tol)
然后我In [2]: X = np.random.random_integers(0, 500, size = (500, 1200))
In [3]: from sklearn.metrics.pairwise import *
In [4]: additive_chi2_kernel(X)
Out[4]:
array([[ -0. , -108450.9702963 , -110301.71485863, ...,
-114578.16996935, -106849.35599894, -102370.37161802],
[-108450.9702963 , -0. , -109129.14648767, ...,
-112274.78420127, -107293.6284921 , -103749.55524937],
[-110301.71485863, -109129.14648767, -0. , ...,
-109282.25057724, -99851.02992075, -103023.45281752],
...,
[-114578.16996935, -112274.78420127, -109282.25057724, ...,
-0. , -110133.11568012, -111218.58079982],
[-106849.35599894, -107293.6284921 , -99851.02992075, ...,
-110133.11568012, -0. , -102516.65009177],
[-102370.37161802, -103749.55524937, -103023.45281752, ...,
-111218.58079982, -102516.65009177, -0. ]])
In [5]: W = np.exp(-_ * (1. / X.shape[1]))
In [6]: W = sp.coo_matrix(W).tocsr()
In [7]: my_laplacian(W)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-9b100976a310> in <module>()
----> 1 my_laplacian(W)
<ipython-input-1-dbf198146761> in my_laplacian(W)
16 L_rw = normalize_L(W, D)
17 if not isPSD(L_rw):
---> 18 raise ValueError('L_rw isn\'t PSD')
19 return L_rw
ValueError: L_rw isn't PSD
,但没有pdb.run('L_rw = my_laplacian(W)')
ValueError
我又跑了一次:
(Pdb) n
> <ipython-input-1-dbf198146761>(17)my_laplacian()
-> if not isPSD(L_rw):
(Pdb) L_rw
<500x500 sparse matrix of type '<type 'numpy.float64'>'
with 250000 stored elements in Compressed Sparse Row format>
(Pdb) n
> <ipython-input-1-dbf198146761>(19)my_laplacian()
-> return L_rw
In [9]: my_laplacian(W)
Out[9]:
<500x500 sparse matrix of type '<type 'numpy.float64'>'
with 250000 stored elements in Compressed Sparse Row format>
In [10]: my_laplacian(W)
Out[10]:
<500x500 sparse matrix of type '<type 'numpy.float64'>'
with 250000 stored elements in Compressed Sparse Row format>
In [11]: my_laplacian(W)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-1-9b100976a310> in <module>()
----> 1 my_laplacian(W)
<ipython-input-1-dbf198146761> in my_laplacian(W)
16 L_rw = normalize_L(W, D)
17 if not isPSD(L_rw):
---> 18 raise ValueError('L_rw isn\'t PSD')
19 return L_rw
似乎不太稳定?或者我弄错了什么?有没有人可以解决一些问题?