在这里,我正在尝试使用numpy完全编码PCA以降低尺寸。 pca()函数是生成模拟数据,处理数据,绘制图形的核心。
我无法为投影生成正确的特征向量。我说这是因为以下原因 生成的两个特征向量不是正交的(如图所示)。
有人可以告诉我为什么会这样吗?
另一个令人困惑的事情是如何将数据居中?是数据中点的平均值还是列的平均值?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
def line(X):
"""
Generating the data with some noise
"""
Y = 0.5 * X + 2
random_data = Y+np.random.normal(loc = 1,scale = 5,size = X.shape[0])
return random_data
def pca():
fig,ax = plt.subplots(1)
#generating the x data
X = np.arange(0,100)
Y = line(X) # generating a simulated correlated data with some random normal noise
data = np.concatenate((X[:,np.newaxis],Y[:,np.newaxis]),axis = 1) # data preparation in the format [[x1,y1],[x2,y2],..]
average = np.mean(data,axis = 1) # mean for normalizing the data with corresponding point's mean
data = data - average[:,np.newaxis] # normalizing data
cov_data = np.cov(data.T) #covariance of the data matrix for calculating the eigen vector
eigen_values,(eigen_vector1,eigen_vector2) = np.linalg.eig(cov_data) # Eigen values and eigen vectors to project the data on
y = data[0][1]+(eigen_vector1[1]/eigen_vector1[0])*(X - data[0][0]) #equation for visualizing the eigen vector 1
y2 = data[0][1]+(eigen_vector2[1]/eigen_vector2[0])*(X - data[0][0]) #same as above for eigenvector 2
sns.scatterplot(x = X, y= line(X),ax = ax) # visualizing the data
ax.plot(X,y,label = "eigenvector1") # visualizing the eigen vector 1
ax.plot(X,y2,label = "eigenvector2")#visualizing the eigen vector 2
plt.legend()
plt.savefig('Plot.png')
plt.show()
pca()
image showing the plot of data and corresponding lines in the direction of eigenvectors