我已经构建了一个基本函数来从我对几个变量感兴趣的3个模型中提取AIC和BIC值。然而,当它运行时,我的计算机经常停止,并说它无法为一个向量分配200MB(我使用的是大型数据集 - 超过500K的情况,是的,我已将内存限制增加到max-4000)。
如果我一次选择几个变量,我实际上设法运行它。我对实际运行该功能感兴趣,但也改进了我的功能代码,这样我就不必在运行之前删除其他所有内容,而且可能不需要等待30分钟。我可能会使用修正的AIC和BIC公式并添加其他内容,因此我宁愿保持AIC和BIC矢量化,而不是切换到其他逻辑回归函数。我玩过它并添加了诸如rm(model1)之类的东西,但它可能差别很小。你能建议解决内存分配问题的代码并可能加快功能吗?
非常感谢
功能:
myF<-function(mydata,TotScore,group){
BIC2<-BIC1<-BIC0<-AIC2<-AIC1<-AIC0<-rep(NA,length(ncol(mydata)))
for (i in (1:ncol(mydata))){
M0<-glm(mydata[,i] ~ TotScore,family=binomial,data=mydata,x=F,y=F,model=F)
AIC0[i]<-extractAIC(M0)[2]
BIC0[i]<-extractAIC(M0,k=log(length(M0$fitted.values)))[2]
rm(M0)
M1<-glm(mydata[,i] ~ TotScore+group,family=binomial,data=mydata,x=F,y=F,model=F)
AIC1[i]<-extractAIC(M1)[2]
BIC1[i]<-extractAIC(M1,k=log(length(M1$fitted.values)))[2]
rm(M1)
M2<-glm(mydata[,i] ~ TotScore+group+TotScore*group,family=binomial,data=mydata,x=F,y=F,model=F)
AIC2[i]<-extractAIC(M2)[2]
BIC2[i]<-extractAIC(M2,k=log(length(M2$fitted.values)))[2]
rm(M2)
}
Results<-cbind(AIC0,AIC1,AIC2,BIC0,BIC1,BIC2)
rownames(Results)<-names(mydata)
return(Results)
}
P.S。可以使用
尝试该模型##Random dataset example
v1<-sample(0:1, 500000, replace=TRUE, prob=c(.80,.20))
v2<-sample(0:1, 500000, replace=TRUE, prob=c(.85,.15))
v3<-sample(0:1, 500000, replace=TRUE, prob=c(.95,.05))
mydata<-as.data.frame(cbind(v1,v2,v3))
TotScore=rowSums(mydata)
group<-(rep (1:5,100000))
myF(mydata,TotScore,group)
答案 0 :(得分:2)
关于具有离散预测变量的二项式数据的好处在于,您可以在不丢失信息的情况下聚合数据
set.seed(12345)
v1<-sample(0:1, 500000, replace=TRUE, prob=c(.80,.20))
v2<-sample(0:1, 500000, replace=TRUE, prob=c(.85,.15))
v3<-sample(0:1, 500000, replace=TRUE, prob=c(.95,.05))
mydata<-as.data.frame(cbind(v1,v2,v3))
mydata$TotScore <- rowSums(mydata)
mydata$group <- rep (1:5,100000)
library(reshape)
myFun2 <- function(Y, dataset){
tmp <- as.data.frame(table(TotScore = dataset$TotScore, group = dataset$group, Response = dataset[, Y]))
levels(tmp$Response) <- c("Failure", "Succes")
tmp <- cast(TotScore + group ~ Response, data = tmp, value = "Freq")
tmp$TotScore <- as.numeric(levels(tmp$TotScore))[tmp$TotScore]
output <- rep(NA, 6)
names(output) <- paste(rep(c("AIC", "BIC"), 3), rep(0:2, each = 2), sep = "")
m <- glm(cbind(Succes, Failure) ~ TotScore, data = tmp, family = binomial,
model = FALSE, x = FALSE, y = FALSE)
output[1:2] <- c(AIC(m), BIC(m))
m <- glm(cbind(Succes, Failure) ~ TotScore + group, data = tmp, family = binomial,
model = FALSE, x = FALSE, y = FALSE)
output[3:4] <- c(AIC(m), BIC(m))
m <- glm(cbind(Succes, Failure) ~ TotScore * group, data = tmp, family = binomial,
model = FALSE, x = FALSE, y = FALSE)
output[5:6] <- c(AIC(m), BIC(m))
output
}
system.time({
sapply(colnames(mydata)[1:3], myFun, dataset = mydata)
})
user system elapsed
3.10 0.06 3.15
答案 1 :(得分:0)
library(difR)
data(verbal)
verbal$TotScore <- rowSums(verbal[, c(1:24)])
verbal$group <- with(verbal, factor(Gender):factor(Anger > 20))
myFun <- function(Y, dataset){
output <- rep(NA, 6)
names(output) <- paste(rep(c("AIC", "BIC"), 3), rep(0:2, each = 2), sep = "")
m <- glm(as.formula(paste(Y, "~ TotScore")), data = dataset, family = binomial,
model = FALSE, x = FALSE, y = FALSE)
output[1:2] <- c(AIC(m), BIC(m))
m <- glm(as.formula(paste(Y, "~ TotScore + group")), data = dataset,
family = binomial, model = FALSE, x = FALSE, y = FALSE)
output[3:4] <- c(AIC(m), BIC(m))
m <- glm(as.formula(paste(Y, "~ TotScore * group")), data = dataset,
family = binomial, model = FALSE, x = FALSE, y = FALSE)
output[5:6] <- c(AIC(m), BIC(m))
output
}
sapply(colnames(verbal)[1:2], myFun, dataset = verbal)