Question

我有几个模型适合预测结果y = x1 + x2 + ..... + x22。这是一个相当数量的预测因子和相当数量的模型。我的客户想知道每个X对估计y的边际影响是什么。模型可以包括样条和交互项。我可以做到这一点，但它很麻烦，需要循环或大量复制粘贴，这很慢或容易出错。我可以通过不同的方式编写我的函数和/或使用purrr或*apply函数来做得更好吗？可重复的例子如下。理想情况下，我可以编写一个函数并将其应用于longdata。

##  create my fake data.

library(tidyverse)
library (rms)
ltrans<- function(l1){ 
  newvar <- exp(l1)/(exp(l1)+1)
  return(newvar)
}

set.seed(123)
mystates <- c("AL","AR","TN")
mydf <- data.frame(idno = seq(1:1500),state = rep(mystates,500))
mydf$x1[mydf$state=='AL'] <- rnorm(500,50,7)
mydf$x1[mydf$state=='AR'] <- rnorm(500,55,8)
mydf$x1[mydf$state=='TN'] <- rnorm(500,48,10)
mydf$x2 <- sample(1:5,500, replace = T)
mydf$x3 <- (abs(rnorm(1500,10,20)))^2
mydf$outcome <- as.numeric(cut2(sample(1:100,1500,replace = T),95))-1
dd<- datadist(mydf)
options(datadist = 'dd')
m1 <- lrm(outcome ~ x1 + x2+ rcs(x3,3), data = mydf)

dothemath <- function(x1 = x1ref,x2 = x2ref,x3 = x3ref) {
  ltrans(-2.1802256-0.01114239*x1+0.050319692*x2-0.00079289232* x3+
             7.6508189e-10*pmax(x3-7.4686271,0)^3-9.0897627e-10*pmax(x3-    217.97865,0)^3+
           1.4389439e-10*pmax(x3-1337.2538,0)^3)}
x1ref <- 51.4
x2ref <- 3
x3ref <- 217.9
dothemath() ## 0.0591
mydf$referent <- dothemath()
mydf$thisobs <- dothemath(x1 = mydf$x1, x2 = mydf$x2, x3 = mydf$x3)
mydf$predicted <- predict(m1,mydf,type = "fitted.ind") ## yes, matches.
mydf$x1_marginaleffect <- dothemath(x1= mydf$x1)/mydf$referent
mydf$x2_marginaleffect <- dothemath(x2 = mydf$x2)/mydf$referent    
mydf$x3_marginaleffect <- dothemath(x3 = mydf$x3)/mydf$referent

## can I do this with long data?
longdata <- mydf %>%
  select(idno,state,referent,thisobs,x1,x2,x3) %>%
  gather(varname,value,x1:x3)

##longdata$marginaleffect <- dothemath(longdata$varname = longdata$value) ##     no, this does not work.
## I need to communicate to the function which variable it is evaluating. 
longdata$marginaleffect[longdata$varname=="x1"] <- dothemath(x1 =         longdata$value[longdata$varname=="x1"])/
                                                longdata$referent[longdata$varname=="x1"]
longdata$marginaleffect[longdata$varname=="x2"] <- dothemath(x2 = longdata$value[longdata$varname=="x2"])/
                                                    longdata$referent[longdata$varname=="x2"]
longdata$marginaleffect[longdata$varname=="x3"] <- dothemath(x3 = longdata$value[longdata$varname=="x3"])/
                                                    longdata$referent[longdata$varname=="x3"]

testing<- inner_join(longdata[longdata$varname=="x1",c(1,7)],mydf[,c(1,10)])
head(testing) ## yes, both methods work.

Answer 1

大多数情况下，您只是在讨论分组mutate，但需要注意dothemath，以便您需要指定变量名称，这可以通过{{1}来完成}或do.call在命名的参数列表上调用它：

purrr::invoke

可能更容易重新配置longdata <- longdata %>% group_by(varname) %>% mutate(marginaleffect = invoke(dothemath, setNames(list(value), varname[1])) / referent) longdata #> # A tibble: 4,500 x 7 #> # Groups: varname [3] #> idno state referent thisobs varname value marginaleffect #> <int> <fct> <dbl> <dbl> <chr> <dbl> <dbl> #> 1 1 AL 0.0591 0.0688 x1 46.1 1.06 #> 2 2 AR 0.0591 0.0516 x1 50.2 1.01 #> 3 3 TN 0.0591 0.0727 x1 38.0 1.15 #> 4 4 AL 0.0591 0.0667 x1 48.4 1.03 #> 5 5 AR 0.0591 0.0515 x1 47.1 1.05 #> 6 6 TN 0.0591 0.0484 x1 37.6 1.15 #> 7 7 AL 0.0591 0.0519 x1 60.9 0.905 #> 8 8 AR 0.0591 0.0531 x1 63.2 0.883 #> 9 9 TN 0.0591 0.0780 x1 47.8 1.04 #> 10 10 AL 0.0591 0.0575 x1 50.5 1.01 #> # ... with 4,490 more rows # the first values look similar inner_join(longdata[longdata$varname == "x1", c(1,7)], mydf[,c(1,10)]) #> Joining, by = "idno" #> # A tibble: 1,500 x 3 #> idno marginaleffect x1_marginaleffect #> <int> <dbl> <dbl> #> 1 1 1.06 1.06 #> 2 2 1.01 1.01 #> 3 3 1.15 1.15 #> 4 4 1.03 1.03 #> 5 5 1.05 1.05 #> 6 6 1.15 1.15 #> 7 7 0.905 0.905 #> 8 8 0.883 0.883 #> 9 9 1.04 1.04 #> 10 10 1.01 1.01 #> # ... with 1,490 more rows # check everything is the same mydf %>% gather(varname, marginaleffect, x1_marginaleffect:x3_marginaleffect) %>% select(idno, varname, marginaleffect) %>% mutate(varname = substr(varname, 1, 2)) %>% all_equal(select(longdata, idno, varname, marginaleffect)) #> [1] TRUE以获取变量名称的附加参数，以避免体操。

通过迭代变量来计算预测模型结果

1 个答案: