怎样才能在不计算协方差矩阵的情况下计算PCA？
下面的代码就是这样做的：
为了比较结果，我做了两次SVD：一次针对减去均值（中心化）后的数据，一次针对原始数据。
那么，为什么可以既不减去均值、也不调用 cov()，而直接用原始数据来得到PCA呢？
import numpy as np
from scipy.linalg import svd
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from scipy import linalg as LA
from sklearn.decomposition import PCA
import copy
# Sample data: 5 observations (rows) x 4 features (columns).
X_train = np.asarray([
    [13.71, 1.86, 2.36, 16.6],
    [12.22, 1.29, 1.94, 19],
    [13.27, 4.28, 2.26, 20],
    [13.16, 3.57, 2.15, 21],
    [13.86, 1.51, 2.67, 25],
])

# --- Decomposition of the mean-centred data ("with covariance") ---
# Centre every column on a fresh array so X_train itself stays untouched.
X = X_train - X_train.mean(axis=0)
n_samples = X.shape[0]

# Route 1: singular values of the centred data matrix.
U, S, VT = svd(X)

# Route 2: eigen-decomposition of the sample covariance matrix
# (variables are the columns of X, hence rowvar=False).
cov_m = np.cov(X, rowvar=False)
eigval, eigvec = np.linalg.eigh(cov_m)

# Print S**2 / (n_samples - 1) next to the covariance eigenvalues
# (largest first) so the two routes can be compared.
print('with covariance')
print('S\t %s' % (S,))
print('S**2\t %s' % (S**2 / (n_samples - 1),))
print('eigval\t %s' % (np.sort(eigval)[::-1],))
with covariance
S [6.1900012 2.67966882 1.2864974 0.08662946]
S**2 [9.57902870e+00 1.79515624e+00 4.13768889e-01 1.87616595e-03]
eigval [9.57902870e+00 1.79515624e+00 4.13768889e-01 1.87616595e-03]
# --- Same decomposition on the raw, un-centred data ("without covariance") ---
# No mean subtraction and no np.cov here: the SVD is applied to X_train as-is.
U1, S1, VT1 = svd(X_train)

# Eigen-decomposition of X^T X built directly from the raw data.
XTX = X_train.T.dot(X_train)
eigval1, eigvec1 = np.linalg.eigh(XTX)

# Print the squared singular values next to the eigenvalues of X^T X
# (largest first) for comparison.
print('\n without covariance')
print('S1\t %s' % (S1,))
print('S1**2\t %s' % (S1**2,))
print('eigval1\t %s' % (np.sort(eigval1)[::-1],))
with covariance:
S [6.1900012 2.67966882 1.2864974 0.08662946]
S**2 [9.57902870e+00 1.79515624e+00 4.13768889e-01 1.87616595e-03]
eigval [9.57902870e+00 1.79515624e+00 4.13768889e-01 1.87616595e-03]