我正在使用 R 中的 caret 包进行 PLS-DA。我的目标是利用牛奶的中红外光谱预测奶牛的状态(0 或 1)。我想比较系数,以了解哪些光谱点对预测的贡献最大。如果使用 caret 中的 `preProc` 选项对数据进行中心化和缩放(标准化),得到的系数是标准化的还是非标准化的?我应该使用标准化系数还是非标准化系数来识别重要变量?
这是我在R中的代码:
# Sample of the data: 10 rows (one per milk sample) and 11 columns.
#   status      : cow status, integer coded 0/1 -- the response to predict
#   X67 ... X87 : 10 spectral points (numeric) -- the predictors
# The data frame is written out literally with structure() so the example is
# self-contained and needs no external file.
data<-structure(list(status = c(1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L,
1L, 1L), X67 = c(0.0621632561087608, 0.0607260726392266, 0.0625290125608445,
0.0630319677293305, 0.0647925734519963, 0.0652132406830785, 0.0642152130603786,
0.0632935389876363, 0.0630211532115932, 0.0653552338480952),
X69 = c(0.0412880629301071, 0.0411925278604031, 0.0422371216118336,
0.0428165234625339, 0.0449542962014675, 0.0450734049081802,
0.0436325967311859, 0.0429527163505555, 0.0416706018149853,
0.0443116500973701), X80 = c(-0.0138179995119572, -0.0144830904901028,
-0.0144161432981491, -0.013074841350317, -0.0167389884591103,
-0.0159232392907143, -0.0143161900341511, -0.0138954184949399,
-0.0147733353078366, -0.0134175941348076), X81 = c(-0.0134893320500851,
-0.0142031051218509, -0.0142825171351433, -0.0127705596387386,
-0.0168376825749874, -0.0157066956162453, -0.0140237100422382,
-0.0135413259267807, -0.0144432000815868, -0.0128327533602714
), X82 = c(-0.0121541880071164, -0.0126882530748845, -0.0128774531185627,
-0.0113642327487468, -0.0153397060930729, -0.0141194649040699,
-0.012583240866661, -0.0119865834712983, -0.0129532031714916,
-0.0113681443035603), X83 = c(-0.0100522302091122, -0.0100194588303566,
-0.0103428065776825, -0.0090412348508835, -0.012253813445568,
-0.0112871341407299, -0.01005644723773, -0.00953329727053641,
-0.0103608369827271, -0.0091012343764305), X84 = c(-0.0068778395652771,
-0.006202656775713, -0.0066996179521084, -0.0058559291064739,
-0.0077349841594696, -0.0070663541555404, -0.00633592158555989,
-0.0060042813420295, -0.0066741779446602, -0.0058899037539959
), X85 = c(-0.0028718337416649, -0.0016290470957756, -0.0022227615118026,
-0.0018559470772744, -0.00229159742593769, -0.00200331956148141,
-0.0016555078327656, -0.0015552900731563, -0.0021986812353134,
-0.00184135138988499), X86 = c(0.001129399985075, 0.0027880594134331,
0.002091933041811, 0.0021506287157535, 0.0029507651925087,
0.00285872071981429, 0.00294967368245119, 0.0027530193328858,
0.0021884441375732, 0.0023270919919014), X87 = c(0.0043732412159443,
0.0061031468212605, 0.0052727945148945, 0.0052438415586949,
0.00681021064519879, 0.0065649412572384, 0.0064241252839565,
0.00608809292316441, 0.0055649057030678, 0.0056490488350391
)), row.names = c(NA, 10L), class = "data.frame")
# Fit a PLS-DA classifier of cow status from the spectral points with caret,
# then extract the regression coefficients of the selected model.
library(caret) # provides createFolds(), trainControl(), train(), twoClassSummary

# Response: column 1 of `data` (status, coded 0/1) as a two-level factor.
# "1" is listed first so that it becomes the first factor level; caret's
# twoClassSummary treats the first level as the event of interest.
# NOTE: the label "status_2" therefore corresponds to status 0 in the raw data.
Ycalib <- factor(
  data[, 1],
  levels = c("1", "0"),
  labels = c("status_1", "status_2")
)

# Predictors: columns 2-11 of `data` (the 10 spectral points). Subsetting a
# data frame by several columns already returns a data frame.
Xcalib <- data[, 2:11]

# Training-set indices for 10-fold cross-validation (returnTrain = TRUE gives
# the rows used for fitting in each fold). Seeded for reproducible folds.
set.seed(1001)
folds <- createFolds(Ycalib, k = 10, list = TRUE, returnTrain = TRUE)

# Resampling set-up: the folds above are passed through `index`, so the
# resamples are exactly those folds. Class probabilities are requested because
# twoClassSummary computes the ROC metric from them.
ctrl <- trainControl(
  method = "repeatedcv",
  index = folds,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = TRUE
)

# Re-seed immediately before training so the fit is reproducible on its own.
set.seed(1001)
plsda <- train(
  x = Xcalib,                      # spectral data
  y = Ycalib,                      # factor vector
  method = "pls",                  # PLS-DA algorithm
  tuneLength = 20,                 # up to 20 candidate numbers of components
  trControl = ctrl,                # cross-validation options defined above
  preProc = c("center", "scale"),  # predictors are centered and scaled
  metric = "ROC"                   # model selection metric for 2 classes
)

# Coefficients of the final model. train() applies the requested preProc to
# the predictors before fitting, so these coefficients refer to the centered
# and scaled (standardized) predictors and are directly comparable across
# spectral points.
coefficients <- coef(plsda$finalModel)