我一直在通过ISLR
软件包研究一些统计学习模型的示例。该代码在此处(https://rpubs.com/davoodastaraky/subset)可用,任何人都可以看到。我也把它放在下面很容易。
library(ISLR)
library(leaps)
data(Hitters)
Hitters
regfit.full = regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
reg.summary = summary(regfit.full)
#plot rss
library(ggvis)
rsq <- as.data.frame(reg.summary$rsq)
names(rsq) <- "R2"
rsq %>%
ggvis(x=~ c(1:nrow(rsq)), y=~R2 ) %>%
layer_points(fill = ~ R2 ) %>%
add_axis("y", title = "R2") %>%
add_axis("x", title = "Number of variables")
par(mfrow=c(2,2))
plot(reg.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
plot(reg.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted
RSq",type="l")
# which.max(reg.summary$adjr2)
points(11,reg.summary$adjr2[11], col="red",cex=2,pch=20)
plot(reg.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
# which.min(reg.summary$cp )
points(10,reg.summary$cp [10],col="red",cex=2,pch=20)
plot(reg.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
# which.min(reg.summary$bic )
points(6,reg.summary$bic [6],col="red",cex=2,pch=20)
plot(regfit.full,scale="bic")
set.seed (1)
train = sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test =(! train )
predict.regsubsets =function (object ,newdata ,id ,...){
form=as.formula(object$call [[2]])
mat=model.matrix(form,newdata)
coefi=coef(object ,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
regfit.best=regsubsets(Salary~.,data=Hitters ,nvmax=19)
coef(regfit.best ,10)
k = 10
set.seed(1)
folds = sample(1:k,nrow(Hitters),replace=TRUE)
table(folds)
代码运行平稳,直到我进入下面的部分:
for(j in 1:k){
best.fit = regsubsets(Salary ~., data=Hitters[folds != j,], nvmax = 19)
for (i in 1:19){
pred = predict.regsubsets(best.fit, Hitters[folds == j, ], id = i)
cv.errors[j, i] = mean((Hitters$Salary[folds == j] - pred)^2)
}
}
出现错误的地方:
Error in mean((Hitters$Salary[folds == j] - pred)^2) :
dims [product 18] do not match the length of object [22]
In addition: Warning message:
In Hitters$Salary[folds == j] - pred :
longer object length is not a multiple of shorter object length
我的问题是:为什么会出现此错误?我如何解决它?该代码从字面上取自该网站,无论如何我都没有对其进行更改。显然我缺少有关对象长度的信息。谢谢。
答案 0 :(得分:1)
如果要“修复”此问题,则需要拉出pred
对象的属性,然后根据其Hitters
从rownames()
对象中选择匹配的值。
> str(Hitters$Salary)
num [1:322] NA 475 480 500 91.5 750 70 100 75 1100 ...
> str(pred)
num [1:18, 1] 988 359 370 808 383 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:18] "-Andre Thornton" "-Bob Dernier" "-Chris Brown" "-Chet Lemon" ...
..$ : NULL
> names(Hitters)
[1] "AtBat" "Hits" "HmRun" "Runs" "RBI" "Walks" "Years" "CAtBat"
[9] "CHits" "CHmRun" "CRuns" "CRBI" "CWalks" "League" "Division" "PutOuts"
[17] "Assists" "Errors" "Salary" "NewLeague"
> rownames(Hitters)
[1] "-Andy Allanson" "-Alan Ashby" "-Alvin Davis" "-Andre Dawson"
[5] "-Andres Galarraga" "-Alfredo Griffin" "-Al Newman" "-Argenis Salazar"
[9] "-Andres Thomas" "-Andre Thornton" "-Alan Trammell" "-Alex Trevino"
[13] "-Andy VanSlyke" "-Alan Wiggins" "-Bill Almon" "-Billy Beane"
#omitted the rest of the 322-item column
答案 1 :(得分:0)
Hitters数据集的薪金列中缺少值。只需放下它们,然后按预期工作即可。