我有一个quasibinomial glm,有两个连续的解释变量(让我们说“LogPesticide”和“LogFood”)和一个互动。我想在不同食物量(例如最小和最大食物价值)下以置信区间计算农药的LC50。如何实现这一目标?
示例:首先,我生成一个数据集。
jira = JIRA('http://jiraurl.com/', basic_auth=('user', 'pass'))
new_issue = jira.create_issue(project='ER', summary='summary', description='desc', issuetype={'name': 'Custom Issue Type'})
new_issue.update(reporter='new_user')
然后我适应了整个glm。模型诊断正常,所有交互术语都很重要。
mydata <- data.frame(
LogPesticide = rep(log(c(0, 0.1, 0.2, 0.4, 0.8, 1.6) + 0.05), 4),
LogFood = rep(log(c(1, 2, 4, 8)), each = 6)
)
set.seed(seed=16)
growth <- function(x, a = 1, K = 1, r = 1) { # Logistic growth function. a = position of turning point
Fx <- (K * exp(r * (x - a))) / (1 + exp(r * (x - a))) # K = carrying capacity
return(Fx) # r = growth rate (larger r -> narrower curve)
}
y <- rep(NA, length = length(mydata$LogPesticide))
y[mydata$LogFood == log(1)] <- growth(x = mydata$LogPesticide[mydata$LogFood == log(1)], a = log(0.1), K = 1, r = 6)
y[mydata$LogFood == log(2)] <- growth(x = mydata$LogPesticide[mydata$LogFood == log(2)], a = log(0.2), K = 1, r = 4)
y[mydata$LogFood == log(4)] <- growth(x = mydata$LogPesticide[mydata$LogFood == log(4)], a = log(0.4), K = 1, r = 3)
y[mydata$LogFood == log(8)] <- growth(x = mydata$LogPesticide[mydata$LogFood == log(8)], a = log(0.8), K = 1, r = 1)
mydata$Dead <- rbinom(n = length(y), size = 20, prob = y)
mydata$Alive <- 20 - mydata$Dead
mydata$Mortality <- cbind(mydata$Dead, mydata$Alive)
我尝试用MASS包中的dose.p()估算LC50。如果LogFood是一个因素,当我按照this post中的讨论重新拟合模型时,这将起作用。但是有两个连续的解释变量,你只得到1个截距,2个斜率和斜率的差异(用于交互)。
我可以使用effect()估算LC50,但不知道如何获取LogPesticide的关联CI。
model <- glm(Mortality ~ LogPesticide * LogFood, family = quasibinomial, data = mydata)
plot(model)
Anova(model)
summary(model)
从dose.p()的代码我看到必须使用vcov矩阵。 effect()还提供了一个vcov矩阵,但我无法修改dose.p()以正确处理该信息。我会很感激任何想法!
答案 0 :(得分:3)
复制数据(更新:ggplot2
的新版本可能不喜欢带有矩阵的奇怪数据框?)
mydata <- data.frame(
LogPesticide = rep(log(c(0, 0.1, 0.2, 0.4, 0.8, 1.6) + 0.05), 4),
LogFood = rep(log(c(1, 2, 4, 8)), each = 6)
)
set.seed(seed=16)
growth <- function(x, a = 1, K = 1, r = 1) {
## Logistic growth function. a = position of turning point
## K = carrying capacity
## r = growth rate (larger r -> narrower curve)
return((K * exp(r * (x - a))) / (1 + exp(r * (x - a))))
}
rlf <- data.frame(LogFood=log(c(1,2,4,8)),
a=log(c(0.1,0.2,0.4,0.8)),
r=6,4,3,1)
mydata <- merge(mydata,rlf)
mydata <- plyr::mutate(mydata,
y=growth(LogPesticide,a,K=1,r),
Dead=rbinom(n=nrow(mydata),size=20,prob=y),
N=20,
Alive=N-Dead,
pmort=Dead/N)
model <- glm(pmort ~ LogPesticide * LogFood, family = quasibinomial,
data = mydata, weights=N)
为方便起见:
cc <- setNames(coef(model),c("b_int","b_P","b_F","b_PF"))
vv <- vcov(model)
dimnames(vv) <- list(names(cc),names(cc))
基本预测数据框:
pframe <- with(mydata,
expand.grid(LogPesticide=seq(min(LogPesticide),max(LogPesticide),
length=51),
LogFood=unique(LogFood)))
pframe$pmort <- predict(model,newdata=pframe,type="response")
现在让我们打破这个。给定水平的(log)食物F
和(log)农药P
的预测值是
logit(surv) = b_int + b_P*P + b_F*F + b_PF*F*P
因此,F
级农药的逻辑曲线是
logit(surv) = (b_int+b_F*F) + (b_P+b_PF*F)*P
我们想知道logit(surv)为0(LC50)的P
的值,所以我们需要
0 = (b_int+b_F*F) + (b_P+b_PF*F)*P50
P50 = -(b_int+b_F*F)/(b_P+b_PF*F)
翻译代码:
P50mean <- function(logF) {
with(as.list(cc), -(b_int+b_F*logF)/(b_P+b_PF*logF))
}
with(mydata,P50mean(c(min=min(LogFood),max=max(LogFood))))
pLC50 <- data.frame(LogFood=unique(mydata$LogFood))
pLC50 <- transform(pLC50,
pmort=0.5,
LogPesticide=P50mean(LogFood))
为了获得置信区间,两种最简单的方法是(1)delta方法和(2)后验预测区间(在某些情况下也称为参数Bayes&#39;)。 (您也可以进行非参数自举。)
我试图手工做到这一点,但意识到它太毛茸茸了(所有四个系数都是强相关的,所有这些相关性都必须在计算中保持跟踪 - 它并不像分子和分母是独立值的通常公式......)
library("emdbook")
deltavar(-(b_int+b_F*2)/(b_P+b_PF*2),meanval=cc,Sigma=vv)
## have to be a bit fancy here with eval/substitute ...
pLC50$var1 <- sapply(pLC50$LogFood,
function(logF)
eval(substitute(
deltavar(-(b_int+b_F*logF)/(b_P+b_PF*logF),
meanval=cc,Sigma=vv),
list(logF=logF))))
这假设(稍微弱一点)参数的采样分布是多元正态。
PP <- function(logF,n=1000) {
b <- MASS::mvrnorm(n,mu=cc,Sigma=vv)
pred <- with(as.data.frame(b),
-(b_int+b_F*logF)/(b_P+b_PF*logF))
return(var(pred))
}
set.seed(101)
pLC50$var2 <- sapply(pLC50$LogFood,PP)
PPI实际上允许我们通过获得预测LC50的分布的分位数来稍微放宽假设......事实证明(见下文)基于PPI的置信区间比Delta更宽一点方法的,但他们并没有相差甚远。
现在描绘整个混乱:
library(ggplot2); theme_set(theme_bw())
gg0 <- ggplot(mydata,aes(LogPesticide,pmort,
colour=factor(LogFood),
fill = factor(LogFood))) + geom_point() +
## individual fits -- a bit ugly
## geom_smooth(method="glm",aes(weight=N),
## method.args=list(family=binomial),alpha=0.1)+
geom_line(data=pframe,linetype=2)+
geom_point(data=pLC50,pch=5,size=4)+
geom_hline(yintercept=0.5,col="gray")
gg0 + geom_errorbarh(data=pLC50,lwd=2,alpha=0.5,
aes(xmin=LogPesticide-1.96*sqrt(var1),
xmax=LogPesticide+1.96*sqrt(var1)),
height=0)+
geom_errorbarh(data=pLC50,
aes(xmin=LogPesticide-1.96*sqrt(var2),
xmax=LogPesticide+1.96*sqrt(var2)),
height=0.02)