我想使用ggplot2包创建一个森林图,我对我的输出感到满意(参见下面的forestplot)。
该图的特征是回归模型中给定变量的水平(优势比和置信度内部)以及参考水平。
问题是需要大量的手工劳动才能生成情节。
第一个问题,我希望参考水平跟随绘图中给定变量的其他水平,所以我手动输入了每个这样的参考水平(参见下面的森林表)。为了使ggplot2工作,我输入了参考电平的任意负比值比和置信区间值,然后将图限制设置为从零到大的正值。
第二个问题,因为我的原始变量在一个列中,我手动输入了颜色,这很耗费时间。
是否有更简单的方法来生成这样的情节?任何帮助将非常感激。
# DATA
mtcars
mtcars$gear <- as.factor(mtcars$gear)
mtcars$carb <- as.factor(mtcars$carb)
# PREPARE ODDS RATIO & CONFIDENCE INTERVALS DATA FRAME
model = lm(mpg ~ gear + carb + disp, data = mtcars ) # make regression model
forest_table = data.frame(
or= round(exp(coef(model)),2),
round(exp(confint(model, level = 0.95)),2),
check.names = F) # make a table with odds ratio and confidence intervals
names(forest_table) = c("or", "ci_lb", "ci_ub") # give columns clear names
library(data.table)
setDT(forest_table, keep.rownames = TRUE)[] # turn row names into a column
forest_table <- as.data.frame(forest_table) # turn table into a data frame
forest_table <- forest_table[-1, ] # get rid of the intercept row
# ADD ROWS WITH REFERENCE LEVELS TO PREPARED DATA FRAME
r <- 2 # row after which new row is to be inserted
newrow <- c("3 reference", -10.00, -9.00, -11.00) # row to be inserted
forest_table <- rbind(forest_table[1:r, ], newrow, forest_table[-(1:r), ]) # insert row
r <- 8 # row after which new row is to be inserted
newrow <- c("1 reference", -10.00, -9.00, -11.00) # row to be inserted
forest_table <- rbind(forest_table[1:r, ], newrow, forest_table[-(1:r), ]) # insert row
# FIX CLASSES IN PREPARED DATA FRAME
forest_table$or <- as.numeric(forest_table$or)
forest_table$ci_lb <- as.numeric(forest_table$ci_lb)
forest_table$ci_ub <- as.numeric(forest_table$ci_ub)
# ADD DUMMY VARIABLE TO CONTROL ORDER IN PLOT
forest_table$order <- as.factor(rep(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))) # create dummy variable
forest_table$order <- factor(forest_table$order,
levels = rev(levels(forest_table$order)))
# use dummy variable to counteract ggplot2 default of reversing the order of levels in
# the prepared data frame when plotting
# PLOT
library(ggplot2)
forestplot <- ggplot(forest_table, aes(or, order)) +
geom_point(size = 5, shape = 18, aes(colour = order)) + # data points
geom_errorbarh(aes(xmax = ci_ub, xmin = ci_lb, colour = order),
height = 0.15) + # error bars
geom_vline(xintercept = 1, linetype = "longdash") + # line marking 0 on x axis
scale_x_continuous(breaks = seq(0, 40000, 10000),
labels = seq(0, 40000, 10000),
limits = c(0, 50000)) + # x axis scale and labels
scale_colour_manual(values = c("blue", "red", "red", "red", "red", "red", "red",
"green", "green", "green")) # manually set one colour per variable
答案 0 :(得分:0)
您可以在表格中添加其他列,指明模型中的实际条款,&amp;根据它分配颜色。这是一个例子:
第1步。从模型系数创建数据框:
new.table <- data.frame(
coef = names(coef(model)),
or = round(exp(coef(model)), 2),
ci_lb = round(exp(confint(model, level = 0.95)), 2)[, 1],
ci_ub = round(exp(confint(model, level = 0.95)), 2)[, 2],
stringsAsFactors = FALSE, row.names = NULL
)
> new.table
coef or ci_lb ci_ub
1 (Intercept) 1.226831e+11 249767693.18 6.026058e+13
2 gear4 5.396000e+01 0.31 9.403190e+03
3 gear5 2.193800e+02 1.03 4.662360e+04
4 carb2 1.400000e-01 0.00 4.340000e+00
5 carb3 2.000000e-02 0.00 1.280000e+00
6 carb4 0.000000e+00 0.00 2.000000e-01
7 carb6 0.000000e+00 0.00 3.700000e-01
8 carb8 0.000000e+00 0.00 2.100000e-01
9 disp 9.800000e-01 0.96 1.000000e+00
第2步。请注意,在&#34; lm&#34; model model$xlevels
包含有关包含多个因子级别的术语的信息。
> model$xlevels
$gear
[1] "3" "4" "5"
$carb
[1] "1" "2" "3" "4" "6" "8"
这可用于创建所有因子水平的参考数据框:
library(dplyr)
library(data.table)
terms.with.levels <- names(model$xlevels)
df.with.levels <- lapply(terms.with.levels,
function(x) data.frame(term = x,
coef = paste0(x, model$xlevels[[x]]),
stringsAsFactors = FALSE)) %>%
rbindlist()
> df.with.levels
term coef
1: gear gear3
2: gear gear4
3: gear gear5
4: carb carb1
5: carb carb2
6: carb carb3
7: carb carb4
8: carb carb6
9: carb carb8
第3步。合并两个数据帧。现在所有参考因子水平都存在,我们有一个列指定术语:
new.table <- merge(new.table, df.with.levels, all = TRUE)
> new.table
coef or ci_lb ci_ub term
1 (Intercept) 1.226831e+11 249767693.18 6.026058e+13 <NA>
2 carb1 NA NA NA carb
3 carb2 1.400000e-01 0.00 4.340000e+00 carb
4 carb3 2.000000e-02 0.00 1.280000e+00 carb
5 carb4 0.000000e+00 0.00 2.000000e-01 carb
6 carb6 0.000000e+00 0.00 3.700000e-01 carb
7 carb8 0.000000e+00 0.00 2.100000e-01 carb
8 disp 9.800000e-01 0.96 1.000000e+00 <NA>
9 gear3 NA NA NA gear
10 gear4 5.396000e+01 0.31 9.403190e+03 gear
11 gear5 2.193800e+02 1.03 4.662360e+04 gear
第4步。进一步修改数据框:
new.table <- new.table %>%
# drop intercept
filter(coef != "(Intercept)") %>%
# indicate whether each row is for a reference level
mutate(is.reference = is.na(or)) %>%
# for non-factor term in the model (e.g. disp) which
# have a single coefficient, term == coef
mutate(term = ifelse(is.na(term), coef, term)) %>%
# set reference levels' x values to 0
mutate_at(vars(or, ci_lb, ci_ub),
funs(ifelse(is.reference, 0, .))) %>%
# order terms according to the model specifications
mutate(term = factor(term,
levels = attr(model$terms, "term.labels")))
> new.table
coef or ci_lb ci_ub term is.reference
1 carb1 0.00 0.00 0.00 carb TRUE
2 carb2 0.14 0.00 4.34 carb FALSE
3 carb3 0.02 0.00 1.28 carb FALSE
4 carb4 0.00 0.00 0.20 carb FALSE
5 carb6 0.00 0.00 0.37 carb FALSE
6 carb8 0.00 0.00 0.21 carb FALSE
7 disp 0.98 0.96 1.00 disp FALSE
8 gear3 0.00 0.00 0.00 gear TRUE
9 gear4 53.96 0.31 9403.19 gear FALSE
10 gear5 219.38 1.03 46623.60 gear FALSE
第5步。创建情节。可以通过将其alpha设置为0(即100%透明度)来隐藏参考电平,并通过facets的顺序控制系数的顺序:
p <- ggplot(new.table,
aes(x = or, xmin = ci_lb, xmax = ci_ub,
y = coef, color = term, alpha = !is.reference)) +
geom_vline(xintercept = 1, linetype = "longdash") +
geom_errorbarh(height = 0.15) +
geom_point(size = 5, shape = 18) +
facet_grid(term~., scales = "free_y", space = "free_y") +
scale_alpha_identity()
第6步。如果需要,进一步调整情节:
p +
# specify colour for each term in the model
scale_color_manual(values = c("gear" = "green",
"carb" = "red",
"disp" = "blue")) +
# hide facet labels
theme(strip.background = element_blank(),
strip.text = element_blank()) +
# remove spacing between facets
theme(panel.spacing = unit(0, "pt"))