我在使用predict()
函数在新(测试)数据集上使用mgcv::gam
(训练)模型时遇到问题。问题出现的原因是mrf
平滑,我已经整合了以解释数据的空间特性。
我使用以下调用来创建我的GAM模型
## Run GAM with MRF
m <- gam(crime ~ s(district,k=nrow(traindata),
bs ='mrf',xt=list(nb=nbtrain)), #define MRF smooth
data = traindata,
method = 'REML',
family = scat(), #fit scaled t distribution
gamma = 1.4
)
我使用邻域结构预测因变量crime
,在平滑术语参数xt
中解析为模型。邻域结构是我使用nb
函数创建的poly2nb()
对象。
现在,如果我想在新的测试数据集上使用predict()
,我不知道如何将相关的邻域结构传递给调用。只提供新数据
pred <- predict.gam(m,newdata=testdata)
引发以下错误:
Error in predict.gam(m, newdata = testdata) :
7, 16, 20, 28, 35, 36, 37, 43 not in original fit
这是使用直接从R内部调用的Columbus数据集完整再现错误:
#ERROR REPRODUCTION
## Load packages
require(mgcv)
require(spdep)
require(dplyr)
## Load Columbus Ohio crime data (see ?columbus for details and credits)
data(columb.polys) #Columbus district shapes list
columb.polys <- lapply(columb.polys,na.omit) #omit NAs (unfortunate problem with the Columbus sample data)
data(columb) #Columbus data frame
df <- data.frame(district=numeric(0),x=numeric(0),y= numeric(0)) #Create empty df to store x, y and IDs for each polygon
## Extract x and y coordinates from each polygon and assign district ID
for (i in 1:length(columb.polys)) {
district <- i-1
x <- columb.polys[[i]][,1]
y <- columb.polys[[i]][,2]
df <- rbind(df,cbind(district,x,y)) #Save in df data.frame
}
## Convert df into SpatialPolygons
sp <- df %>%
group_by(district) %>%
do(poly=select(., x, y) %>%Polygon()) %>%
rowwise() %>%
do(polys=Polygons(list(.$poly),.$district)) %>%
{SpatialPolygons(.$polys)}
## Merge SpatialPolygons with data
spdf <- SpatialPolygonsDataFrame(sp,columb)
## Split into training and test sample (80/20 ratio)
splt <- sample(1:2,size=nrow(spdf),replace=TRUE,prob=c(0.8,0.2))
train <- spdf[splt==1,]
test <- spdf[splt==2,]
## Prepapre both samples and create NB objects
traindata <- train@data #Extract data from SpatialPolygonsDataFrame
testdata <- test@data
traindata <- droplevels(as(train, 'data.frame')) #Drop levels
testdata <- droplevels(as(test, 'data.frame'))
traindata$district <- as.factor(traindata$district) #Factorize
testdata$district <- as.factor(testdata$district)
nbtrain <- poly2nb(train, row.names=train$Precinct, queen=FALSE) #Create NB objects for training and test sample
nbtest <- poly2nb(test, row.names=test$Precinct, queen=FALSE)
names(nbtrain) <- attr(nbtrain, "region.id") #Set region.id
names(nbtest) <- attr(nbtest, "region.id")
## Run GAM with MRF
m <- gam(crime ~ s(district, k=nrow(traindata), bs = 'mrf',xt = list(nb = nbtrain)), # define MRF smooth
data = traindata,
method = 'REML', # fast version of REML smoothness selection; alternatively 'GCV.Cp'
family = scat(), #fit scaled t distribution
gamma = 1.4
)
## Run prediction using new testing data
pred <- predict.gam(m,newdata=testdata)
答案 0 :(得分:1)
解决方案:
我终于找到了用解决方案更新此帖子的时间。感谢大家的帮助。这是用于通过随机训练-测试拆分来实现k倍CV的代码:
#Apply k-fold cross validation
mses <- data.frame() #Create empty df to store CV squared error values
scores <- data.frame() #Create empty df to store CV R2 values
set.seed(42) #Set seed for reproducibility
k <- 10 #Define number of folds
for (i in 1:k) {
# Create weighting column
data$weight <- sample(c(0,1),size=nrow(data),replace=TRUE,prob=c(0.2,0.8)) #0 Indicates testing sample, 1 training sample
#Run GAM with MRF
ctrl <- gam.control(nthreads = 6) #Set controls
m <- gam(crime ~ s(disctrict, k=nrow(data), bs = 'mrf',xt = list(nb = nb)), #define MRF smooth
data = data,
weights = data$weight, #Use only weight==1 observations (training)
method = 'REML',
control = ctrl,
family = scat(),
gamma = 1.4
)
#Generate test dataset
testdata <- data[data$weight==0,] #Select test data by weight
#Predict test data
pred <- predict(m,newdata=testdata)
#Extract MSES
mses[i,1] <- mean((data$R_MeanDiff[data$weight==0] - pred)^2)
scores[i,1] <- summary(m)$r.sq
}
av.mse.GMRF <- mean(mses$V1)
av.r2.GMRF <- mean(scores$V1)
答案 1 :(得分:0)
对于当前的解决方案,我有一个问题的批评,那就是完整的数据集被用来“训练”模型,这意味着由于使用了测试数据来训练模型,因此预测将出现偏差。
这仅需进行一些小调整即可解决:
#Apply k-fold cross validation
mses <- data.frame() #Create empty df to store CV squared error values
scores <- data.frame() #Create empty df to store CV R2 values
set.seed(42) #Set seed for reproducibility
k <- 10 #Define number of folds
#For loop for each fold
for (i in 1:k) {
# Create weighting column
data$weight <- sample(c(0,1),size=nrow(data),replace=TRUE,prob=c(0.2,0.8)) #0 Indicates testing sample, 1 training sample
#Generate training dataset
trainingdata <- data[data$weight == 1, ] #Select test data by weight
#Generate test dataset
testdata <- data[data$weight == 0, ] #Select test data by weight
#Run GAM with MRF
ctrl <- gam.control(nthreads = 6) #Set controls
m <- gam(crime ~ s(disctrict, k=nrow(data), bs = 'mrf',xt = list(nb = nb)), #define MRF smooth
data = trainingdata,
weights = data$weight, #Use only weight==1 observations (training)
method = 'REML',
control = ctrl,
family = scat(),
gamma = 1.4
)
#Predict test data
pred <- predict(m,newdata = testdata)
#Extract MSES
mses[i,1] <- mean((data$R_MeanDiff[data$weight==0] - pred)^2)
scores[i,1] <- summary(m)$r.sq
}
#Get average scores from each k-fold test
av.mse.GMRF <- mean(mses$V1)
av.r2.GMRF <- mean(scores$V1)