在Python中计算Hotelling的T²得分

时间:2014-08-20 19:29:52

标签: python matplotlib statistics scipy pca

我在Python中使用matplotlib对数据集做了PCA。但是,matplotlib不像Matlab那样提供T²(t平方)得分。有没有办法像Matlab一样计算Hotelling的T²得分?

感谢。

2 个答案:

答案 0 :(得分:3)

matplotlib的PCA类不包含Hotelling T²的计算,但只需几行代码即可实现。以下代码包含一个计算每个点T²值的函数。`__main__`脚本将PCA应用于Matlab的pca文档中使用的同一示例,因此您可以验证该函数是否生成与Matlab相同的值。

from __future__ import print_function, division

import numpy as np
from matplotlib.mlab import PCA


def hotelling_tsquared(pc):
    """Compute Hotelling's T^2 value for every row of ``pc.a``.

    `pc` should be the object returned by matplotlib.mlab.PCA(); only its
    attributes ``a``, ``Wt`` and ``s`` are read.

    A covariance matrix is rebuilt as ``Wt.T * diag(s) * Wt / (n - 1)``,
    where ``n`` is the number of rows of ``pc.a``.  For each row ``x_i`` of
    ``pc.a`` the quadratic form ``x_i' * inv(cov) * x_i`` is evaluated, and
    the results are returned as an array with one entry per row.
    """
    observations = pc.a.T
    n_obs = observations.shape[1]
    scale_matrix = np.diag(pc.s)
    covariance = np.dot(np.dot(pc.Wt.T, scale_matrix), pc.Wt) / (n_obs - 1)
    # Solve cov * w = x rather than forming the inverse explicitly.
    solved = np.linalg.solve(covariance, observations)
    # Column-wise dot product of x and w gives one quadratic form per row.
    return np.sum(observations * solved, axis=0)


if __name__ == "__main__":

    hald_text = """Y       X1      X2      X3      X4
    78.5    7       26      6       60
    74.3    1       29      15      52
    104.3   11      56      8       20
    87.6    11      31      8       47
    95.9    7       52      6       33
    109.2   11      55      9       22
    102.7   3       71      17      6
    72.5    1       31      22      44
    93.1    2       54      18      22
    115.9   21      47      4       26
    83.8    1       40      23      34
    113.3   11      66      9       12
    109.4   10      68      8       12
    """
    # The first line of the table is the column header, so skip it.
    table = np.loadtxt(hald_text.splitlines(), skiprows=1)
    # Drop the first column (Y) and keep the remaining ones (X1..X4).
    ingredients = table[:, 1:]
    n_rows = ingredients.shape[0]

    pc = PCA(ingredients, standardize=False)

    np.set_printoptions(precision=4)

    # For coeff and latent, compare to
    #     http://www.mathworks.com/help/stats/pca.html#btjpztu-1
    print("coeff:")
    print(pc.Wt)
    print()

    latent = pc.s / (n_rows - 1)
    latent_format = " %9.4f" * len(latent)
    print("latent:" + latent_format % tuple(latent))
    print()

    # For tsquared, compare to
    #     http://www.mathworks.com/help/stats/pca.html#bti6r0c-1
    # Computed before the heading is printed, so a failure prints nothing.
    tsquared = hotelling_tsquared(pc)
    print("tsquared:")
    print(tsquared)

输出:

coeff:
[[ 0.0678  0.6785 -0.029  -0.7309]
 [ 0.646   0.02   -0.7553  0.1085]
 [-0.5673  0.544  -0.4036  0.4684]
 [ 0.5062  0.4933  0.5156  0.4844]]

latent:  517.7969   67.4964   12.4054    0.2372

tsquared:
[ 5.6803  3.0758  6.0002  2.6198  3.3681  0.5668  3.4818  3.9794  2.6086
  7.4818  4.183   2.2327  2.7216]

答案 1 :(得分:0)

尽管这是一个老问题,我还是贴出代码,因为它可能对某些人有帮助。代码如下;另外,它还可以一次执行多个Hotelling检验。

import numpy as np
from scipy.stats import f as f_distrib


def hotelling_t2(X, Y):
    """Run two-sample Hotelling T^2 tests, one per slice along the last axis.

    X and Y are 3D arrays:
        dim 0: number of features
        dim 1: number of subjects
        dim 2: number of mesh nodes or voxels (number of tests)

    X and Y must agree on dim 0 and dim 2; dim 1 (subjects) may differ.

    Returns
    -------
    pval : ndarray
        Upper-tail probability of the F statistic of each test, with
        length-1 axes squeezed out.
    t2 : ndarray, shape (1, number of tests)
        Hotelling T^2 statistic of each test.
    """
    nx = X.shape[1]
    ny = Y.shape[1]
    p = X.shape[0]

    # Group means, kept 3D as (features, 1, tests) so they broadcast
    # against the (features, subjects, tests) inputs.
    Xbar = X.mean(1, keepdims=True)
    Ybar = Y.mean(1, keepdims=True)

    X_Xbar = X - Xbar
    Y_Ybar = Y - Ybar
    # Per-test scatter matrices: sum over subjects (j) of the outer products
    # of the centered feature vectors, giving shape (features, features, tests).
    Wx = np.einsum('ijk,ljk->ilk', X_Xbar, X_Xbar)
    Wy = np.einsum('ijk,ljk->ilk', Y_Ybar, Y_Ybar)
    # Pooled covariance of the two groups.
    W = (Wx + Wy) / float(nx + ny - 2)

    Xbar_minus_Ybar = Xbar - Ybar
    # np.linalg.solve works on stacks along the leading axis, so move the
    # test axis first, solve W * x = (Xbar - Ybar) per test, and move it back.
    x = np.linalg.solve(W.transpose(2, 0, 1),
                        Xbar_minus_Ybar.transpose(2, 0, 1))
    x = x.transpose(1, 2, 0)

    t2 = np.sum(Xbar_minus_Ybar * x, 0)
    t2 = t2 * float(nx * ny) / float(nx + ny)

    # Rescale T^2 to an F statistic with (p, nx + ny - 1 - p) degrees of
    # freedom.
    dof = nx + ny - 1 - p
    stat = t2 * float(dof) / (float(nx + ny - 2) * p)

    # Use the survival function instead of 1 - cdf: the subtraction cancels
    # to exactly 0.0 for very small p-values.
    pval = np.squeeze(f_distrib.sf(stat, p, dof))
    return pval, t2