PCA异常检测并在观察级别识别数据帧中的哪个变量真正导致了主成分1中的异常行为。示例显示在最后一张图像中的观察链接34中。 https://pubs.rsc.org/en/content/articlehtml/2014/ay/c3ay41907j#tab1。
因此,我在酒数据上应用了PCA,其中数据分别由酒类型和酒的各种特征组成。一旦将pca应用于数据,便有了PC1 PC2和许多组件。当我们绘制PC1时,我们可以看到一个特定的观测值(例如:观测值34),这就是异常模式。因此,我们需要了解为什么哪个变量导致了此异常数据点。到现在为止,我已经计算了负荷,该负荷给出了变量的总体贡献,在这里我想分别为每一列的每个变量贡献
# Creating Wine data frame for PCA Anomaly Contribution plot
rm(list=ls())
Wine_Type <- c("ARG-BNS1", "ARG-DDA1", "ARG-FFL1", "ARG-FLM1", "ARG-ICR1", "ARG-SAL1", "AUS-CAV1", "AUS-EAG1", "AUS-HAR1", "AUS-IB41", "AUS-KIL1", "AUS-KIR1", "AUS-NUG1", "AUS-SOC1", "AUS-TGH1", "AUS-VAF1", "AUS-WBL1", "AUS-WES1", "CHI-CDD1", "CHI-CDM1", "CHI-CMO1", "CHI-CSU1", "CHI-GNE1", "CHI-IND1", "CHI-LJO1", "CHI-S151", "CHI-SCH1", "CHI-SHE1", "CHI-SUN1", "CHI-UND1", "CHI-UTA1", "CHI-VDA1", "CHI-VDS1", 'SOU-HHI1', "SOU-INS1", "SOU-KWV1", 'SOU-NED1', 'SOU-PDM1', "SOU-ROO1", "SOU-RW21", "SOU-SAV1", "SOU-SIM1", "SOU-SPI1", "SOU-SRE1")
Ethanol <- c(13.6199999, 14.0600004, 13.7399998, 13.9499998, 14.4700003, 14.6099997, 13.6499996, 14.1199999, 13.1300001, 13.49, 15.0900002, 14.6300001, 13.6300001, 13.6700001, 14.4300003, 13.4499998, 13.8299999, 13.8500004, 13.9700003, 12.8400002, 14.1899996, 14.1300001, 13.6599998, 14.2700005, 13.8400002, 13.5799999, 13.2299995, 13.6099997, 13.7700005, 13.6199999, 13.5100002, 12.79, 14.6300001, 13.96, 14.0500002, 14.0200005, 13.9799995, 14.5, 13.8999996, 14.2200003, 13.8000002, 14.4499998, 14.3599997, 14.25)
Total_acid <- c(3.53999996, 3.74000001, 3.26999998, 3.66000009, 3.66000009, 3.45000005, 4.30999994, 3.88000011, 3.82999992, 3.69, 3.98000002, 4.78000021, 4.63999987, 3.86999989, 4.51000023, 4.34000015, 4.21999979, 4.15999985, 3.53999996, 3.22000003, 3.4000001, 3.61999989, 3.07999992, 3.43000007, 3.04999995, 3.24000001, 3.03999996, 3.07999992, 3.36999989, 3.49000001, 3.42000008, 2.99000001, 3.1400001, 4.55000019, 3.5, 3.88000011, 4.36000013, 4.42000008, 4.0999999, 4.05999994, 3.58999991, 3.83999991, 3.81999993, 3.56999993)
Volatile_acid<- c(0.289999992, 0.589999974, 0.469999999, 0.469999999, 0.379999995, 0.519999981, 0.319999993, 0.370000005, 0.270000011, 0.32, 0.469999999, 0.430000007, 0.709999979, 0.540000021, 0.400000006, 0.460000008, 0.330000013, 0.360000014, 0.289999992, 0.340000004, 0.349999994, 0.330000013, 0.280000001, 0.439999998, 0.25999999, 0.239999995, 0.340000004, 0.280000001, 0.300000012, 0.209999993, 0.319999993, 0.370000005, 0.439999998, 1.02999997, 0.379999995, 0.409999996, 0.430000007, 0.360000014, 0.649999976, 0.419999987, 0.349999994, 0.340000004, 0.319999993, 0.479999989)
Malic_acid <- c(0.889999986, 0.239999995, -0.07, 0.090000004, 0.610000014, 0.159999996, 0.180000007, 0.360000014, 0.400000006, 0.5, 0.439999998, 0.519999981, 0.189999998, 0.159999996, 0.310000002, 0.469999999, 0.49000001, 0.170000002, 0.479999989, 0.419999987, 0.460000008, 0.310000002, 0.419999987, 0.449999988, 0.469999999, 0.529999971, 0.419999987, 0.50999999, 0.319999993, 0.449999988, 0.540000021, 0.289999992, 0.379999995, -0.620000005, 0.479999989, 0.449999988, 0.280000001, 0.529999971, 0.189999998, 0.280000001, 0.550000012, 0.709999979, 0.50999999, 0.389999986)
pH <- c(3.71000004, 3.73000002, 3.86999989, 3.78999996, 3.70000005, 3.92000008, 3.5999999, 3.70000005, 3.66000009, 3.7, 3.67000008, 3.52999997, 3.6400001, 3.6500001, 3.70000005, 3.48000002, 3.54999995, 3.53999996, 3.6400001, 3.71000004, 3.72000003, 3.6400001, 3.68000007, 3.75999999, 3.71000004, 3.61999989, 3.76999998, 3.6400001, 3.68000007, 3.57999992, 3.63000011, 3.70000005, 3.75, 3.88000011, 3.81999993, 3.68000007, 3.66000009, 3.61999989, 3.66000009, 3.63000011, 3.82999992, 3.75999999, 3.8499999, 3.70000005)
Latic_acid <- c(0.779999971, 1.25, 1.13, 1, 0.810000002, 1.75999999, 1.37, 1.01999998, 1.13, 1.02, 1.00999999, 0.839999974, 1.78999996, 1.22000003, 1.49000001, 1.16999996, 1.13, 0.959999979, 0.779999971, 1.19000006, 0.850000024, 0.819999993, 0.910000026, 0.790000021, 0.800000012, 0.829999983, 1.08000004, 0.939999998, 0.930000007, 0.800000012, 0.839999974, 1.26999998, 0.939999998, 2.95000005, 1.13999999, 1.00999999, 1.44000006, 0.980000019, 1.17999995, 1.25, 1.13999999, 1.00999999, 1.02999997, 0.949999988)
Rest_sugar <- c(1.46000004, 2.42000008, 1.51999998, 4.17000008, 1.25, 1.39999998, 3.79999995, 4.32000017, 3.99000001, 6.4, 1.05999994, 1.20000005, 1.45000005, 0.620000005, 5.61999989, 1.27999997, 1.19000006, 2.58999991, 1.14999998, 1.37, 1.75, 1.77999997, 4.23999977, 1.51999998, 2.07999992, 2.45000005, 1.05999994, 4.0999999, 2.74000001, 1.37, 1.14999998, 1.15999997, 2.19000006, 2.3599999, 1.60000002, 1.45000005, 1.01999998, 0.75999999, 1.35000002, 1.53999996, 2.27999997, 2.11999989, 2.68000007, 1.92999995)
Citric_acid <- c(0.310000002, 0.180000007, 0.389999986, 0.409999996, 0.140000001, 0.100000001, 0.239999995, 0.319999993, 0.340000004, 0.13, -0.039999999, -0.050000001, 0.159999996, 0.370000005, 0.430000007, -0.01, 0.090000004, 0.200000003, 0.119999997, 0.119999997, 0.170000002, 0.25999999, 0.100000001, 0.059999999, 0.209999993, 0.25999999, 0.209999993, 0.059999999, 0.300000012, 0.25999999, 0.090000004, 0.200000003, 0.200000003, 0.25, -0.039999999, 0.07, 0.059999999, 0.159999996, 0.079999998, 0.090000004, 0.079999998, 0.200000003, -0.02, 0.07)
Co2 <- c(85.6100006, 175.199997, 513.73999, 379.399994, 154.880005, 156.300003, 462.619995, 244.149994, 212, 419.38, 48.0200005, 154.820007, 243.960007, 563.400024, 347.880005, 263.459991, 288.970001, 272.190002, 210.529999, 338.869995, 245.070007, 183.699997, 353.529999, 247.289993, 399.070007, 475.890015, 603.320007, 400.470001, 180.139999, 495.329987, 388.079987, 390.149994, 228.389999, 282.73999, 510.079987, 243.580002, 452.959991, 184.940002, 183.059998, 246.339996, 297.540009, 104.269997, 510.089996, 260.079987)
Density <- c(0.99000001, 1, 0.99000001, 1, 0.99000001, 0.99000001, 1, 1, 1, 1, 0.99000001, 1, 1, 0.99000001, 1, 0.99000001, 0.99000001, 1, 0.99000001, 1, 0.99000001, 0.99000001, 1, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 1, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 1, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 1, 1, 1, 0.99000001)
Total.polyphenol.index <- c(60.9199982, 70.6399994, 63.5900002, 73.3000031, 71.6900024, 71.7900009, 59.5999985, 59.5, 59.4199982, 63.86, 70.0999985, 72.3700027, 55.0699997, 63.0400009, 63.5200005, 62.6899986, 59.0800018, 83.5100021, 64.3099976, 53.0999985, 66.8199997, 64.8300018, 52.1599998, 63.75, 56.5499992, 53.1300011, 54.8899994, 54.2599983, 64.8000031, 58.8300018, 58.8100014, 50.4399986, 61.9399986, 44.6800003, 62.5800018, 60.8699989, 62.3499985, 60.1300011, 58.4700012, 57.0800018, 60.3600006, 68.6800003, 67.7699966, 57.8600006)
Glycerol <- c(9.72000027, 10.0500002, 10.9200001, 9.68999958, 10.8100004, 10.1899996, 10.6599998, 11.0699997, 8.89000034, 10.35, 11.4300003, 11.6400003, 9.59000015, 11.2799997, 10.9300003, 9.46000004, 11.1000004, 10.4499998, 10.5799999, 8.80000019, 10.1099997, 9.85000038, 9.53999996, 9.93000031, 9.47999954, 9.32999992, 9.02000046, 9.38000011, 10.3299999, 9.86999989, 9.76000023, 8.11999989, 10.0500002, 8.22999954, 10.1000004, 10.6099997, 10.6199999, 12.5200005, 11.7200003, 10.2399998, 10.4700003, 11.1800003, 10.5799999, 9.93999958)
Methanol <- c(0.159999996, 0.200000003, 0.180000007, 0.230000004, 0.200000003, 0.189999998, 0.25, 0.25, 0.230000004, 0.26, 0.189999998, 0.280000001, 0.25, 0.140000001, 0.300000012, 0.180000007, 0.219999999, 0.239999995, 0.180000007, 0.170000002, 0.180000007, 0.219999999, 0.180000007, 0.209999993, 0.189999998, 0.150000006, 0.150000006, 0.170000002, 0.170000002, 0.189999998, 0.180000007, 0.129999995, 0.150000006, 0.170000002, 0.25, 0.200000003, 0.219999999, 0.219999999, 0.180000007, 0.219999999, 0.239999995, 0.209999993, 0.270000011, 0.200000003)
Tartaric.acid <- c(1.74000001, 1.58000004, 1.24000001, 2.25999999, 1.22000003, 0.899999976, 1.80999994, 1.64999998, 2.11999989, 1.81, 1.47000003, 2.11999989, 1.36000001, 1.00999999, 1.80999994, 2.13000011, 1.54999995, 2.47000003, 1.72000003, 1.85000002, 1.48000002, 1.83000004, 1.38, 1.48000002, 1.65999997, 1.63, 1.55999994, 1.41999996, 1.62, 1.77999997, 1.29999995, 1.46000004, 1.30999994, 0.910000026, 1.22000003, 1.30999994, 1.76999998, 1.55999994, 1.40999997, 1.5, 1.70000005, 1.52999997, 2.01999998, 1.42999995)
country <- c("Argentina","Argentina","Argentina","Argentina","Argentina","Argentina","Australia", "Australia","Australia","Australia","Australia","Australia","Australia","Australia","Australia", "Australia","Australia","Australia","Chile","Chile","Chile","Chile","Chile","Chile","Chile","Chile", "Chile","Chile","Chile","Chile","Chile","Chile","Chile","South_Africa","South_Africa","South_Africa", "South_Africa","South_Africa","South_Africa","South_Africa","South_Africa","South_Africa", "South_Africa","South_Africa")
wdata <- data.frame(Wine_Type,Ethanol,Total_acid,Volatile_acid,Malic_acid,pH,Latic_acid,Rest_sugar,Citric_acid,Co2,Density,Total.polyphenol.index,Glycerol,Methanol,Tartaric.acid,country)
wdata
wdata$country <- as.factor(wdata$country)
df_index <- wdata[,c("Wine_Type","country")] # removing column which are not for PCA
dx <- wdata[,!names(wdata) %in% names(df_index)]
rownames(dx) <- df_index$Wine_Type
# Normalising data for PCA Analysis
scaleContinuous = function(data) {
data <- data[,sapply(data, function(x) is.numeric(x))]
data <- as.data.frame(data)
binary = apply(data, 2, function(x) {all(x %in% 0:1)})
data[!binary] = scale(data[!binary])
return(data)
}
dx <- as.data.frame(dx)
dx <- scaleContinuous(dx)
# Performing PCA
pca_df <- princomp(dx, cor = T)
summary(pca_df)
# Getting Score & Loading from PCA Attributes
score_df <- as.data.frame(pca_df$scores)
loading_df <- as.matrix(pca_df$loadings)
plot(as.ts(score_df$Comp.1),main="First Principal Component", type = "l", col = "brown", lwd = 3)
plot(as.ts(score_df$Comp.2),main="Second Principal Component", type = "l", col = "red", lwd = 3)
my_vector1=loading_df[,1]
my_vector2=loading_df[,2]
names(my_vector1)= rownames(loading_df)
names(my_vector2)= rownames(loading_df)
library(RColorBrewer)
coul = brewer.pal(9, "Paired")
# par(mfrow=c(2,1))
a1=barplot(my_vector1, col=coul , las=1, names.arg="",main = paste("Weight/loading For PC1"))
text(a1[,1], 0.2 ,srt = 90, adj= 1, xpd = TRUE, labels = names(dx) , cex=1.0)
a2 = barplot(my_vector2,col = coul,las = 1,names.arg =“”,main = paste(“ PC2的重量/负载”))
text(a2 [,1],0.2,srt = 90,adj = 1,xpd = TRUE,标签= names(dx),cex = 1.0)
因此,预期输出是在每次观察时计算PC1中识别出的异常的每个变量的贡献。作为参考,随附了使用Hotelling T2进行标识的链接。https://pubs.rsc.org/en/content/articlehtml/2014/ay/c3ay41907j#tab1
因此可以预期的是:如何使用最后一个图像中用红色标记突出显示的公式为PC1线图所示的观察号34计算霍特林T2得分或贡献图,并分别附上更多参考链接。
Basically how to compute the below Equation shown in image