我真的可以使用一些帮助。
我正在尝试使用交叉验证技术来找到最佳模型。我使用了该网站的参考代码。
https://github.com/asadoughi/stat-learning/blob/master/ch6/lab.R
请参见第63行。
我对另一个数据集使用了相同的代码,并且一切正常。当我使用这个新数据集时,出现错误
Error in plot.window(...) : need finite 'ylim' values
当我尝试plot(mean.cv.errors)
时出现错误。我看到问题出在绘图功能之前。平均简历错误没有被计算。对于不同的预测变量,我得到的平均简历错误为“ NAN”
1 2 3 4 5 6 7 8
NaN NaN NaN NaN NaN NaN NaN NaN
任何人都有线索吗?我从数据中删除了NA。而且我对可能发生的事情完全不知所措,因为相同的精确代码可用于另一个数据集。
这是数据的结构
structure(list(tricepind = c(-0.174723355, -0.012222222, -0.197554452,
-0.042844901, -0.288806432, -0.340831629, -0.07727797, -0.016715831,
0.032448378, 0.223333333, -0.234205488, 0.152073733, 0.1, 0.066666667,
-0.09843684), mkcal = c(1451.990902, 1820.887165, 2025.580287,
1522.201067, 1296.587413, 936.4362557, 2626.190579, 1257.284695,
1583.382929, 1736.695, 1964.600102, 3557.202041, 1682.712691,
2025.962999, 2286.300483), mprot = c(82.15660833, 79.896551,
70.76528433, 68.026405, 40.859294, 45.39550133, 96.65918833,
82.80520367, 82.48830233, 76.22586667, 92.65016433, 164.821377,
67.04030333, 82.30652767, 59.10089967), mcarb = c(144.6609883,
207.803092, 301.791884, 154.252719, 192.215434, 125.836917, 326.8027877,
117.3693597, 151.8666383, 226.6798, 246.8333723, 455.0111473,
217.4003043, 209.0277287, 254.0715917), mtfat = c(64.452471,
73.34697467, 37.79965033, 72.50962033, 38.87718467, 31.354984,
111.493208, 56.441886, 73.22733933, 56.61331667, 67.261771, 121.9704157,
55.08478833, 94.518705, 100.8741383), PC1 = c(-0.447910206, -0.294634121,
-1.104462969, -0.547207734, -1.954444086, -2.196982329, 2.746913539,
-1.023090581, -0.764200454, -0.584591205, 0.77843409, 5.614654485,
-0.999691479, 0.279942766, 0.896578187), PC2 = c(-0.642332236,
0.049369806, -0.216059532, 1.160722893, 1.078477828, -0.150613681,
1.895259257, -1.909344827, 1.644354816, 1.614658854, 0.433529118,
-1.669928792, -0.560657387, -1.145066836, 1.866870422), PC3 = c(-0.451625917,
-0.772244866, 1.06416389, -0.408526673, 0.337918493, -0.254740649,
1.480378587, 0.583072925, -1.619576656, -1.637944088, -0.430379578,
-0.512822799, 2.018634475, 0.26331773, 3.128258848), PC4 = c(-0.968856054,
0.16683708, 0.914246075, -0.219132873, 0.670302106, 0.368790712,
0.642579887, -1.921774612, 0.016672151, 1.765303371, 0.683175144,
0.884292702, -0.388954363, -1.532636673, -1.199798116)), class = "data.frame", row.names = c(NA,
-15L))
这是我使用的代码。
usdtricepby6predictors <- read.csv("usdtricepby8predictors2.csv", header = TRUE, na.strings = ".", stringsAsFactors = FALSE)
usdtricepby6predictors <- na.omit(usdtricepby6predictors)
usdtricepby6predictors <- sapply(usdtricepby6predictors, as.numeric)
usdtricepby6predictors <- as.data.frame(usdtricepby6predictors)
predict.regsubsets=function(object,newdata,id,...){ #predic function
form=as.formula(object$call[[2]]) ## extract formula
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
k=10
set.seed(1)
folds=sample(1:k,nrow(usdtricepby6predictors),replace=TRUE)
cv.errors=matrix(NA,k,8, dimnames=list(NULL, paste(1:8)))
for(j in 1:k) {
best.fit=regsubsets(tricepind~.,data=usdtricepby6predictors[folds!=j,], nvmax=8)
for(i in 1:8){
pred=predict(best.fit,usdtricepby6predictors[folds==j,],id=i)
cv.errors[j,i]=mean((usdtricepby6predictors$tricepind[folds==j]-pred)^2)
}
}
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
points(which.min(mean.cv.errors),mean.cv.errors[which.min(mean.cv.errors)],
col="red",cex=2,pch=20)
reg.best=regsubsets(tricepind~.,data=usdtricepby6predictors,nvmax=8)
coef(reg.best,which.min(mean.cv.errors))