我使用 prcomp 对一个包含 24 个健康变量和其他社会经济变量的数据集执行 PCA。我的目的是得到一个健康指数,以便用于回归分析。我使用以下代码:
# Build a one-dimensional health index from the first principal component.
# Column 1 of `health` is excluded from the PCA; `rank. = 1` keeps only PC1.
# NOTE(review): with `scale. = FALSE` the variables are not standardised, so
# the variable with the largest variance can dominate PC1 -- confirm intended.
total_pca <- prcomp(health[, -1], scale. = FALSE, rank. = 1)
# `total_pca$x` is an n x 1 matrix; store PC1 as a plain numeric vector so the
# data-frame column is an ordinary variable rather than an embedded matrix.
data$pca <- unname(total_pca$x[, 1])
# Convert into percentile: the score is negated before ranking, so a larger
# PC1 score maps to a lower percentile.
data$PVW <- ecdf(-data$pca)(-data$pca)
一篇文章中写道:“所有载荷都是正数,这意味着第一主成分的值越大,健康状况越差。然后将第一主成分转换为个体的百分位数,使得数值越高反映的健康状况越好。因此,我们可以将估计出的健康参数解释为:健康指数每提高一个百分位,工作概率的变化量。”
我的问题是:我得到的载荷有正有负,那么我还能沿用文章中的解释吗?还是应该把负号反转过来?如果需要,该怎么做?
另一个问题是,我的数据包含不同的调查轮次(wave)。我在某处读到,不能按轮次把数据拆分成多个子数据集,再对每个子数据集分别做 PCA——这种说法正确吗?如果正确,我应该怎么处理?
我的数据的可重现的例子是:
# Reproducible sample of the health data (dput output): 40 rows, 25 columns
# (`wave` followed by 24 health variables). `bmi` carries a "labelled" class
# with value labels for the codes -3, -2 and -1; every other column is a plain
# numeric vector. Row names are the row indices of the original data set.
health <- structure(list(wave = c(1, 2, 4, 5, 1, 5, 5, 4, 4, 1, 1, 1, 4, 2, 4, 2, 4, 6, 2, 4, 5, 1, 4, 1, 1, 2, 1, 2, 5, 2, 2, 4, 2, 1, 4, 4, 4, 1, 4, 2), fairpoor = c(1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0), adl = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), mental = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0), heart = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), blood = c(1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0), stroke = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), diabetes = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lung = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0), arthritis = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1), cancer = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), backjoint = c(1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0), doctor = c(1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), hospital = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), nursinghome = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), bmi = structure(c(32.1, 28.7, 24.7, 23.5, 25.1, 26.3, 22.8, 26.3, 17.2, 32.2, 21.2, 23.6, 28.3, 35.8, 28.3, 28.7, 28.1, 20.4, 23.7, 22.7, 20.4, 25.5, 29.7, 20.3, 20.8, 23.1, 23.3, 26.3, 34.2, 40.6, 24.9, 27.2, 26.4, 23.5, 32.1, 32.8, 26.0, 23.4, 23.7, 22.8), labels = structure(c(-3, -2, -1), .Names = c("Implausible/ suspected wrong", "Refusal", "Don't know")), class = "labelled"), walking = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), sitting = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), chair = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), stairs = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), kneeling = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), arm = c(0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0), pullpush = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lifting = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), coin = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("wave", "fairpoor", "adl", "mental", "heart", "blood", "stroke", "diabetes", "lung", "arthritis", "cancer", "backjoint", "doctor", "hospital", "nursinghome", "bmi", "walking", "sitting", "chair", "stairs", "kneeling", "arm", "pullpush", "lifting", "coin"), row.names = c(323L, 1847L, 3731L, 5973L, 244L, 5914L, 6289L, 3847L, 3804L, 75L, 247L, 212L, 3878L, 1858L, 3994L, 2046L, 3920L, 9459L, 1850L, 
4000L, 6072L, 253L, 3826L, 148L, 319L, 1855L, 17L, 1849L, 5683L, 1791L, 2002L, 3744L, 2027L, 219L, 4052L, 3837L, 4008L, 127L, 3906L, 1880L), class = "data.frame")
答案 0 :(得分:0)
biplot 可让您更好地解释健康数据中的 PCA 。
在您给出的数据中,bmi 的数值远大于其他所有变量,因此在相同的百分比变动下,它对数据变异的贡献会更大。
因此,始终建议先对变量进行标准化(缩放),使每个变量具有同等的重要性。此外,您的示例数据中似乎存在常数列(所有个体的取值均为 0);方差为零的列无法被标准化,因此需要将它们从分析中剔除。
# Count the non-zero entries in each column. vapply() walks the data-frame
# columns directly, so no column is coerced through a matrix as with apply(),
# and the result is guaranteed to be one integer per column.
non_zero_counts <- vapply(health, function(x) sum(x != 0), integer(1))
# Positions of the columns that are zero for every row.
all_zero_col <- which(non_zero_counts == 0)
# removing columns with all zeroes from PCA
# Column 1 is dropped as well; `scale. = TRUE` standardises each remaining
# variable to unit variance, which is not possible for a constant column.
total_pca <- prcomp(health[, -c(1, all_zero_col)], scale. = TRUE)
主成分的系数可以让您了解主成分在分析中如何与原始变量相关联。
# Attach ggbiplot, which provides ggbiplot() for drawing PCA biplots.
library(ggbiplot)
# Draw the biplot of the fitted PCA with both axes limited to [-2, 2].
ggbiplot(total_pca) +
  xlim(-2, 2) +
  ylim(-2, 2)
如您所见,几乎所有原始变量(diabetes 和 lung 除外)都与 PC1 呈负相关。如果这些疾病/原始变量的取值越高表示健康状况越差,那么较高的 PC1 得分就对应较好的健康状况。