通过PCA检测变量并确定发生异常的原因(例如,通过:Hotelling T2)

时间:2018-12-24 16:11:07

标签: r loading pca anomaly-detection

PCA异常检测并在观察级别识别数据帧中的哪个变量真正导致了主成分1中的异常行为。示例显示在最后一张图像中的观察链接34中。 https://pubs.rsc.org/en/content/articlehtml/2014/ay/c3ay41907j#tab1

因此,我在酒数据上应用了PCA,其中数据分别由酒类型和酒的各种特征组成。一旦将pca应用于数据,便有了PC1 PC2和许多组件。当我们绘制PC1时,我们可以看到一个特定的观测值(例如:观测值34),这就是异常模式。因此,我们需要了解为什么哪个变量导致了此异常数据点。到现在为止,我已经计算了负荷,该负荷给出了变量的总体贡献,在这里我想分别为每一列的每个变量贡献

# Creating Wine data frame for PCA Anomaly Contribution plot
rm(list=ls())
Wine_Type <- c("ARG-BNS1",  "ARG-DDA1", "ARG-FFL1", "ARG-FLM1", "ARG-ICR1", "ARG-SAL1", "AUS-CAV1", "AUS-EAG1", "AUS-HAR1", "AUS-IB41", "AUS-KIL1", "AUS-KIR1", "AUS-NUG1", "AUS-SOC1", "AUS-TGH1", "AUS-VAF1", "AUS-WBL1", "AUS-WES1", "CHI-CDD1", "CHI-CDM1", "CHI-CMO1", "CHI-CSU1", "CHI-GNE1", "CHI-IND1", "CHI-LJO1", "CHI-S151", "CHI-SCH1", "CHI-SHE1", "CHI-SUN1", "CHI-UND1", "CHI-UTA1", "CHI-VDA1", "CHI-VDS1", 'SOU-HHI1', "SOU-INS1", "SOU-KWV1", 'SOU-NED1', 'SOU-PDM1', "SOU-ROO1", "SOU-RW21", "SOU-SAV1", "SOU-SIM1", "SOU-SPI1", "SOU-SRE1")

Ethanol <- c(13.6199999,    14.0600004, 13.7399998, 13.9499998, 14.4700003, 14.6099997, 13.6499996, 14.1199999, 13.1300001, 13.49,  15.0900002, 14.6300001, 13.6300001, 13.6700001, 14.4300003, 13.4499998, 13.8299999, 13.8500004, 13.9700003, 12.8400002, 14.1899996, 14.1300001, 13.6599998, 14.2700005, 13.8400002, 13.5799999, 13.2299995, 13.6099997, 13.7700005, 13.6199999, 13.5100002, 12.79,  14.6300001, 13.96,  14.0500002, 14.0200005, 13.9799995, 14.5,   13.8999996, 14.2200003, 13.8000002, 14.4499998, 14.3599997, 14.25)

Total_acid <- c(3.53999996, 3.74000001, 3.26999998, 3.66000009, 3.66000009, 3.45000005, 4.30999994, 3.88000011, 3.82999992, 3.69,   3.98000002, 4.78000021, 4.63999987, 3.86999989, 4.51000023, 4.34000015, 4.21999979, 4.15999985, 3.53999996, 3.22000003, 3.4000001,  3.61999989, 3.07999992, 3.43000007, 3.04999995, 3.24000001, 3.03999996, 3.07999992, 3.36999989, 3.49000001, 3.42000008, 2.99000001, 3.1400001,  4.55000019, 3.5,    3.88000011, 4.36000013, 4.42000008, 4.0999999,  4.05999994, 3.58999991, 3.83999991, 3.81999993, 3.56999993)
Volatile_acid<- c(0.289999992,  0.589999974,    0.469999999,    0.469999999,    0.379999995,    0.519999981,    0.319999993,    0.370000005,    0.270000011,    0.32,   0.469999999,    0.430000007,    0.709999979,    0.540000021,    0.400000006,    0.460000008,    0.330000013,    0.360000014,    0.289999992,    0.340000004,    0.349999994,    0.330000013,    0.280000001,    0.439999998,    0.25999999, 0.239999995,    0.340000004,    0.280000001,    0.300000012,    0.209999993,    0.319999993,    0.370000005,    0.439999998,    1.02999997, 0.379999995,    0.409999996,    0.430000007,    0.360000014,    0.649999976,    0.419999987,    0.349999994,    0.340000004,    0.319999993,    0.479999989)         
Malic_acid <- c(0.889999986,    0.239999995,    -0.07,  0.090000004,    0.610000014,    0.159999996,    0.180000007,    0.360000014,    0.400000006,    0.5,    0.439999998,    0.519999981,    0.189999998,    0.159999996,    0.310000002,    0.469999999,    0.49000001, 0.170000002,    0.479999989,    0.419999987,    0.460000008,    0.310000002,    0.419999987,    0.449999988,    0.469999999,    0.529999971,    0.419999987,    0.50999999, 0.319999993,    0.449999988,    0.540000021,    0.289999992,    0.379999995,    -0.620000005,   0.479999989,    0.449999988,    0.280000001,    0.529999971,    0.189999998,    0.280000001,    0.550000012,    0.709999979,    0.50999999, 0.389999986)
pH <- c(3.71000004, 3.73000002, 3.86999989, 3.78999996, 3.70000005, 3.92000008, 3.5999999,  3.70000005, 3.66000009, 3.7,    3.67000008, 3.52999997, 3.6400001,  3.6500001,  3.70000005, 3.48000002, 3.54999995, 3.53999996, 3.6400001,  3.71000004, 3.72000003, 3.6400001,  3.68000007, 3.75999999, 3.71000004, 3.61999989, 3.76999998, 3.6400001,  3.68000007, 3.57999992, 3.63000011, 3.70000005, 3.75,   3.88000011, 3.81999993, 3.68000007, 3.66000009, 3.61999989, 3.66000009, 3.63000011, 3.82999992, 3.75999999, 3.8499999,  3.70000005)                     
Latic_acid <- c(0.779999971,    1.25,   1.13,   1,  0.810000002,    1.75999999, 1.37,   1.01999998, 1.13,   1.02,   1.00999999, 0.839999974,    1.78999996, 1.22000003, 1.49000001, 1.16999996, 1.13,   0.959999979,    0.779999971,    1.19000006, 0.850000024,    0.819999993,    0.910000026,    0.790000021,    0.800000012,    0.829999983,    1.08000004, 0.939999998,    0.930000007,    0.800000012,    0.839999974,    1.26999998, 0.939999998,    2.95000005, 1.13999999, 1.00999999, 1.44000006, 0.980000019,    1.17999995, 1.25,   1.13999999, 1.00999999, 1.02999997, 0.949999988)            
Rest_sugar <- c(1.46000004, 2.42000008, 1.51999998, 4.17000008, 1.25,   1.39999998, 3.79999995, 4.32000017, 3.99000001, 6.4,    1.05999994, 1.20000005, 1.45000005, 0.620000005,    5.61999989, 1.27999997, 1.19000006, 2.58999991, 1.14999998, 1.37,   1.75,   1.77999997, 4.23999977, 1.51999998, 2.07999992, 2.45000005, 1.05999994, 4.0999999,  2.74000001, 1.37,   1.14999998, 1.15999997, 2.19000006, 2.3599999,  1.60000002, 1.45000005, 1.01999998, 0.75999999, 1.35000002, 1.53999996, 2.27999997, 2.11999989, 2.68000007, 1.92999995)
Citric_acid <- c(0.310000002,   0.180000007,    0.389999986,    0.409999996,    0.140000001,    0.100000001,    0.239999995,    0.319999993,    0.340000004,    0.13,   -0.039999999,   -0.050000001,   0.159999996,    0.370000005,    0.430000007,    -0.01,  0.090000004,    0.200000003,    0.119999997,    0.119999997,    0.170000002,    0.25999999, 0.100000001,    0.059999999,    0.209999993,    0.25999999, 0.209999993,    0.059999999,    0.300000012,    0.25999999, 0.090000004,    0.200000003,    0.200000003,    0.25,   -0.039999999,   0.07,   0.059999999,    0.159999996,    0.079999998,    0.090000004,    0.079999998,    0.200000003,    -0.02,  0.07)            
Co2 <- c(85.6100006,    175.199997, 513.73999,  379.399994, 154.880005, 156.300003, 462.619995, 244.149994, 212,    419.38, 48.0200005, 154.820007, 243.960007, 563.400024, 347.880005, 263.459991, 288.970001, 272.190002, 210.529999, 338.869995, 245.070007, 183.699997, 353.529999, 247.289993, 399.070007, 475.890015, 603.320007, 400.470001, 180.139999, 495.329987, 388.079987, 390.149994, 228.389999, 282.73999,  510.079987, 243.580002, 452.959991, 184.940002, 183.059998, 246.339996, 297.540009, 104.269997, 510.089996, 260.079987)                   
Density <- c(0.99000001,    1,  0.99000001, 1,  0.99000001, 0.99000001, 1,  1,  1,  1,  0.99000001, 1,  1,  0.99000001, 1,  0.99000001, 0.99000001, 1,  0.99000001, 1,  0.99000001, 0.99000001, 1,  0.99000001, 0.99000001, 0.99000001, 0.99000001, 1,  0.99000001, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 1,  0.99000001, 0.99000001, 0.99000001, 0.99000001, 0.99000001, 1,  1,  1,  0.99000001)
Total.polyphenol.index <- c(60.9199982, 70.6399994, 63.5900002, 73.3000031, 71.6900024, 71.7900009, 59.5999985, 59.5,   59.4199982, 63.86,  70.0999985, 72.3700027, 55.0699997, 63.0400009, 63.5200005, 62.6899986, 59.0800018, 83.5100021, 64.3099976, 53.0999985, 66.8199997, 64.8300018, 52.1599998, 63.75,  56.5499992, 53.1300011, 54.8899994, 54.2599983, 64.8000031, 58.8300018, 58.8100014, 50.4399986, 61.9399986, 44.6800003, 62.5800018, 60.8699989, 62.3499985, 60.1300011, 58.4700012, 57.0800018, 60.3600006, 68.6800003, 67.7699966, 57.8600006) 
Glycerol <- c(9.72000027,   10.0500002, 10.9200001, 9.68999958, 10.8100004, 10.1899996, 10.6599998, 11.0699997, 8.89000034, 10.35,  11.4300003, 11.6400003, 9.59000015, 11.2799997, 10.9300003, 9.46000004, 11.1000004, 10.4499998, 10.5799999, 8.80000019, 10.1099997, 9.85000038, 9.53999996, 9.93000031, 9.47999954, 9.32999992, 9.02000046, 9.38000011, 10.3299999, 9.86999989, 9.76000023, 8.11999989, 10.0500002, 8.22999954, 10.1000004, 10.6099997, 10.6199999, 12.5200005, 11.7200003, 10.2399998, 10.4700003, 11.1800003, 10.5799999, 9.93999958)              
Methanol <- c(0.159999996,  0.200000003,    0.180000007,    0.230000004,    0.200000003,    0.189999998,    0.25,   0.25,   0.230000004,    0.26,   0.189999998,    0.280000001,    0.25,   0.140000001,    0.300000012,    0.180000007,    0.219999999,    0.239999995,    0.180000007,    0.170000002,    0.180000007,    0.219999999,    0.180000007,    0.209999993,    0.189999998,    0.150000006,    0.150000006,    0.170000002,    0.170000002,    0.189999998,    0.180000007,    0.129999995,    0.150000006,    0.170000002,    0.25,   0.200000003,    0.219999999,    0.219999999,    0.180000007,    0.219999999,    0.239999995,    0.209999993,    0.270000011,    0.200000003)
Tartaric.acid <- c(1.74000001,  1.58000004, 1.24000001, 2.25999999, 1.22000003, 0.899999976,    1.80999994, 1.64999998, 2.11999989, 1.81,   1.47000003, 2.11999989, 1.36000001, 1.00999999, 1.80999994, 2.13000011, 1.54999995, 2.47000003, 1.72000003, 1.85000002, 1.48000002, 1.83000004, 1.38,   1.48000002, 1.65999997, 1.63,   1.55999994, 1.41999996, 1.62,   1.77999997, 1.29999995, 1.46000004, 1.30999994, 0.910000026,    1.22000003, 1.30999994, 1.76999998, 1.55999994, 1.40999997, 1.5,    1.70000005, 1.52999997, 2.01999998, 1.42999995)
country <- c("Argentina","Argentina","Argentina","Argentina","Argentina","Argentina","Australia",   "Australia","Australia","Australia","Australia","Australia","Australia","Australia","Australia",    "Australia","Australia","Australia","Chile","Chile","Chile","Chile","Chile","Chile","Chile","Chile",    "Chile","Chile","Chile","Chile","Chile","Chile","Chile","South_Africa","South_Africa","South_Africa",   "South_Africa","South_Africa","South_Africa","South_Africa","South_Africa","South_Africa",  "South_Africa","South_Africa")
wdata <- data.frame(Wine_Type,Ethanol,Total_acid,Volatile_acid,Malic_acid,pH,Latic_acid,Rest_sugar,Citric_acid,Co2,Density,Total.polyphenol.index,Glycerol,Methanol,Tartaric.acid,country)
wdata
wdata$country <- as.factor(wdata$country)
df_index <- wdata[,c("Wine_Type","country")] # removing column which are not for PCA
dx <- wdata[,!names(wdata) %in% names(df_index)]
rownames(dx) <- df_index$Wine_Type  

# Normalising data for PCA Analysis
scaleContinuous = function(data) {
 data <- data[,sapply(data, function(x) is.numeric(x))]
 data <- as.data.frame(data)

  binary = apply(data, 2, function(x) {all(x %in% 0:1)}) 
  data[!binary] = scale(data[!binary])
  return(data)
}
dx <- as.data.frame(dx)
dx <- scaleContinuous(dx)   

Data set View

# Performing PCA 
pca_df <- princomp(dx, cor = T) 
summary(pca_df)  

# Getting Score & Loading from PCA Attributes
score_df <- as.data.frame(pca_df$scores)
loading_df <- as.matrix(pca_df$loadings) 

可视化PC1和PC2以及为PC1和PC2加载

plot(as.ts(score_df$Comp.1),main="First Principal Component", type = "l", col = "brown", lwd = 3)

PC1 Line plot

plot(as.ts(score_df$Comp.2),main="Second Principal Component", type = "l", col = "red", lwd = 3)

PC2 Line plot

my_vector1=loading_df[,1]
my_vector2=loading_df[,2]
names(my_vector1)= rownames(loading_df)
names(my_vector2)= rownames(loading_df)
library(RColorBrewer)
coul = brewer.pal(9, "Paired")  

PC1和PC2的贡献图。

 #   par(mfrow=c(2,1))
    a1=barplot(my_vector1, col=coul , las=1, names.arg="",main = paste("Weight/loading For PC1"))
    text(a1[,1], 0.2 ,srt = 90, adj= 1, xpd = TRUE, labels = names(dx) , cex=1.0)  

Contribution Plot for PC1

a2 = barplot(my_vector2,col = coul,las = 1,names.arg =“”,main = paste(“ PC2的重量/负载”)) text(a2 [,1],0.2,srt = 90,adj = 1,xpd = TRUE,标签= names(dx),cex = 1.0)
Contribution Plot For PC2

因此,预期输出是在每次观察时计算PC1中识别出的异常的每个变量的贡献。作为参考,随附了使用Hotelling T2进行标识的链接。https://pubs.rsc.org/en/content/articlehtml/2014/ay/c3ay41907j#tab1

Expected Output for contribution plot

因此可以预期的是:如何使用最后一个图像中用红色标记突出显示的公式为PC1线图所示的观察号34计算霍特林T2得分或贡献图,并分别附上更多参考链接。

Basically how to compute the below Equation shown in image

Equation to compute for Observation level

0 个答案:

没有答案