Question

我有一个庞大的遗传数据矩阵（1e7行代表个体x 5,000列代表标记），我想在其上执行PCA以保持c。 20列。但是，由于内存问题，我不能在8GB 64位计算机上使用R 3.1.2上的dudi.pca或big.PCA执行PCA。

另一种方法是计算矩阵的行子集上主轴坐标的近似值，然后使用与近似PA坐标的线性组合来变换整个矩阵。

我正在使用dudi.pca面对一个简单的PCA相关问题：如何使用原始矩阵和列坐标矩阵（=主轴）获取行坐标？

这是一个简单的例子，让我们采用随机矩阵M（3行和4列），例如：

 M= 
1 9 10 13
20 13 20 7
18 19 17 10

执行dudi.pca（M，center = T，scale = T）并且只保留一台PC，dudi.pca输出以下$ c1矩阵（列标准分数即主轴）：

c1 =
-0.547
-0.395
-0.539
0.504

要计算第一个主轴上数据的行坐标，我想要做内部产品：

r =
-0.547*1 + -0.395*9 + -0.539*10 + -0.504*13
-0.547*20 + -0.395*13 + -0.539*20 + -0.504*17
-0.547*18 + -0.395*19 + -0.539*17 + -0.504*10

即

r =
-2.944
-23.331
-21.481

但如果我查看dudi.pca在同一数据集上原生计算的$ li（行坐标，即主成分），我读到：

r' =
2.565
-1.559
-1.005

使用dudi.pca $ ci矩阵制定行坐标时我做错了什么？

非常感谢你的帮助，

Quaerens。

代码：

> M=matrix(c(1,9,10,13,20,13,20,7,18,19,17,10), ncol=4, byrow=T)
> M
     [,1] [,2] [,3] [,4]
[1,]    1    9   10   13
[2,]   20   13   20    7
[3,]   18   19   17   10
> N=dudi.pca(M, center=T, scale=T, scannf=F, nf=1)
> N$c1
          CS1
V1 -0.5468634
V2 -0.3955638
V3 -0.5389504
V4  0.5039863
> r=c( M[1,] %*% N$c1[,1], M[2,] %*% N$c1[,1], M[3,] %*% N$c1[,1] )
> r
[1]  -2.94462 -23.33070 -21.48155
> N$li
      Axis1
1  2.565165
2 -1.559546
3 -1.005619

Answer 1

如果仍有兴趣...

ADE4在二元图上工作，因此当p大于n时，奇异值分解在nxn对称矩阵上进行

library(ade4)
M=matrix(c(1,9,10,13,20,13,20,7,18,19,17,10), ncol=4, byrow=T)
M
##      [,1] [,2] [,3] [,4]
## [1,]    1    9   10   13
## [2,]   20   13   20    7
## [3,]   18   19   17   10
N=dudi.pca(M, center=T, scale=T, scannf=F, nf=1)

#dimensions of M
n=3
p=4

X=scalewt(M,center=T,scale=T)

#this could be done in two ways. Singular Value Decomposition or Duality    Diagrams.
#Consider a Singular value decomposition of X; S=UDV; where S is X, U is the left triangular matrix, and V is the right triangular matrix, and D is the diagonal matrix of eigen values

svd=svd(X)

#These are equivalent
N$c1
svd$v[,1]

#Equivalent
N$eig
## [1] 3.341175 0.658825
svd$d[1:2]
## [1] 3.341175 0.658825

#Diagonal matrix of eigen values
lambda=diag(svd$d)

#N$lw gives the row weights
N$lw
#0.3333333 0.3333333 0.3333333

#find the inverse of the diagonal matrix of row weights; this is the normalization part
K=solve(sqrt(diag(N$lw,n)))%*%svd$u


#These are equivalent
head(K[,1])
## [1]  1.4033490 -0.8531958 -0.5501532
head(N$l1)
##          RS1
## 1  1.4033490
## 2 -0.8531958
## 3 -0.5501532

#Find Principal Components
pc=K%*%sqrt(lambda)
#These are equivalent

head(pc)
##           [,1]       [,2]
## [1,]  2.565165 -0.1420130
## [2,] -1.559546 -0.9154578
## [3,] -1.005619  1.0574707
head(N$li)
##       Axis1
## 1  2.565165
## 2 -1.559546
## 3 -1.005619

这也可以使用ade4中实现的对偶图来完成在这里查看ade4中实现的二元图的参考：http://projecteuclid.org/euclid.aoas/1324399594

Q<-diag(p)
D<-diag(1/n, n)
rk<-qr(X)
rank=rk$rank

#Statistical Triplets
V<-t(X)%*%D%*%X 
W<-X%*%Q%*%t(X)

#Compute the eigen values and vectors of the statistical triplet
example.eigen=eigen(W%*%D)

#Equivalent
N$eig
## [1] 3.341175 0.658825
example.eigen$values[1:rank]
## [1] 3.341175 0.658825

#Diagonal matrix of eigen values
lambda=diag(example.eigen$values[1:rank])


#find the inverse of the diagonal matrix of row weights; this is the normalizing part
Binv<-solve(sqrt(D))

K=Binv%*%example.eigen$vectors[,1:rank]

#These are equivalent
head(K[,1])
## [1]  1.4033490 -0.8531958 -0.5501532
head(N$l1)
##          RS1
## 1  1.4033490
## 2 -0.8531958
## 3 -0.5501532

#Find Principal Components
pc=K%*%sqrt(lambda)
#These are equivalent

head(pc)
##           [,1]       [,2]
## [1,]  2.565165 -0.1420130
## [2,] -1.559546 -0.9154578
## [3,] -1.005619  1.0574707
head(N$li)
##       Axis1
## 1  2.565165
## 2 -1.559546
## 3 -1.005619

使用dudi.pca转换PCA上下文中的行

1 个答案: