我正在尝试编写一个函数,该函数根据前两个主成分计算 PCA 的 95% 预测椭圆的中心、宽度、高度和旋转角度,并基于前 5 个主成分的马氏距离、以 p <= 0.001 为截止值检测异常值。椭圆使用 matplotlib.patches 的 Ellipse 进行绘制。下面生成这些椭圆的数学推导是正确的,还是存在问题?
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from scipy.spatial.distance import mahalanobis
from scipy.stats import chi2
def pca_ci_out(df: pd.DataFrame, ci: float = 0.95, pci: int = 5) -> tuple:
    """Compute a PCA prediction ellipse and flag Mahalanobis outliers.

    A PCA is fitted on ``df.to_numpy()``. Outliers are detected from the
    Mahalanobis distance of each row to the centroid in the space of the
    first ``pci`` principal components; the ellipse is derived from the
    first two principal components only, since those are what gets plotted.

    Args:
        df: Input data. ``df.reset_index()`` must expose an ``'ID'`` column,
            i.e. ``'ID'`` is either an index level or a regular column.
            NOTE(review): if ``'ID'`` is a regular column it is also fed
            into the PCA via ``df.to_numpy()`` -- confirm callers keep it
            in the index.
        ci: Coverage probability of the ellipse (chi-square quantile with
            2 degrees of freedom).
        pci: Number of leading principal components used for the
            Mahalanobis distance.

    Returns:
        A tuple ``(CI_ellipse, outliers)`` where

        * ``CI_ellipse`` is ``[center, major, minor, rotation]``: ``center``
          is an ``(x, y)`` tuple, ``major`` and ``minor`` are the FULL axis
          lengths (diameters), and ``rotation`` is the counter-clockwise
          angle of the major axis in DEGREES, folded into ``[-90, 90)``.
          These match the ``xy``, ``width``, ``height`` and ``angle``
          arguments of ``matplotlib.patches.Ellipse``.
        * ``outliers`` is a list of ``[id, pc1, pc2, distance]`` for every
          row whose Mahalanobis distance is at or beyond the 0.999
          chi-square quantile.

    Raises:
        ValueError: If the PCA yields fewer than two components, so no 2-D
            ellipse can be built.
    """
    id_list = df.reset_index()['ID'].to_list()
    scores = PCA().fit_transform(df.to_numpy())

    if scores.shape[1] < 2:
        raise ValueError(
            "at least two principal components are required for the ellipse"
        )

    # --- Outlier detection on the leading `pci` components ---------------
    lead = scores[:, :pci]
    # Use the number of components actually available for the chi-square
    # degrees of freedom; slicing silently truncates when pci is too large.
    n_dims = lead.shape[1]
    centroid = lead.mean(axis=0)
    # atleast_2d keeps the single-component case (0-d covariance) invertible.
    inv_cov = np.linalg.inv(np.atleast_2d(np.cov(lead, rowvar=False)))
    offsets = lead - centroid
    # Row-wise quadratic form d_i^2 = (x_i - m)^T S^-1 (x_i - m).
    sq_dist = np.einsum('ij,jk,ik->i', offsets, inv_cov, offsets)
    dist = np.sqrt(np.maximum(sq_dist, 0.0))

    # Squared Mahalanobis distance is ~ chi2(n_dims) under normality, so the
    # distance itself is compared against the square root of the quantile
    # (p = 0.001 in the upper tail).
    extreme_dist = np.sqrt(chi2.ppf(0.999, n_dims))
    # Iterating rows positionally keeps duplicate IDs distinct and avoids a
    # linear search per outlier.
    outliers = [
        [row_id, *row_scores[:2], row_dist]
        for row_id, row_scores, row_dist in zip(id_list, scores, dist)
        if row_dist >= extreme_dist
    ]

    # --- Prediction ellipse on the top two components --------------------
    plane = scores[:, :2]
    cov_mat = np.cov(plane, rowvar=False)
    chi2_scale = chi2.ppf(ci, 2)
    # eigh is the right solver for a symmetric matrix: real output and
    # eigenvalues sorted ascending, so index 1 is always the major axis.
    # Eigenvectors are the COLUMNS of the returned matrix.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_mat)
    eigenvalues = np.maximum(eigenvalues, 0.0)  # guard tiny negative round-off
    major = 2 * np.sqrt(chi2_scale * eigenvalues[1])
    minor = 2 * np.sqrt(chi2_scale * eigenvalues[0])

    major_axis = eigenvectors[:, 1]
    # arctan2 handles a vertical major axis (x-component == 0); the result is
    # converted to degrees because that is what matplotlib's Ellipse expects.
    angle = np.degrees(np.arctan2(major_axis[1], major_axis[0]))
    # An eigenvector's sign is arbitrary, so fold the angle into [-90, 90).
    rotation = (angle + 90.0) % 180.0 - 90.0

    center = plane.mean(axis=0)
    CI_ellipse = [tuple(center), major, minor, rotation]
    return CI_ellipse, outliers