Question

我绘制了一系列条形图，这些条形图表示比例的置信区间的模拟。我想在每个栏中添加一行以表示成功比例。

我要绘制的比例在该图的数据框中。我还没有弄清楚如何在每个单独的条形图中为该数据点添加一个线元素。

该可视化来自Harvey Matulsky的《直观生物统计学》的第36页。这是从给定样本空间中抽取样本，记录成功比例并计算置信区间的模拟。

我使用geom_segment绘制了条形图，因此我可以使条形图在置信区间的下端开始，而不是在x轴上绘制。我在整个图表上添加了一条水平线，以显示示例空间中成功的真实比例（红色和白色球集合中的红色球）。

我尝试将geom_hline和geom_segment映射到数据点trials_df $ proportion。我不能正确地做到这一点。

这是我整个可视化的代码。将其分解为一些函数，然后运行整个模拟，打印图的数据框，然后运行我到目前为止的图（每个条上没有比例线）。


    library(ggplot2)

    run_trials <- function(sample_space, N) {
            sample(sample_space,
                   size = N,
                   replace = TRUE)
    }

    success_count <- function(trials, success_value) {
            result <- sum(trials == success_value)
            result
    }

    proportion <- function(trials, success_value) {
            success_count(trials, success_value) / length(trials)
    }

    wald_mod <- function(success_count, trial_count) {
            z <- 1.96
            p_prime <- (success_count + (0.5 * z^2)) / (trial_count + z^2)
            W <- z * sqrt((p_prime * (1 - p_prime)) / (trial_count + z^2))
            result <- c((p_prime - W), (p_prime + W))
            result
    }

    get_trial_results <- function(trials, success_value) {
            p <- proportion(trials, success_value)
            successes <- success_count(trials, success_value)
            confidence_interval <- wald_mod(successes, length(trials))

            result <- list(p, confidence_interval)
            result
    }

    run_simulation <- function() {
            sample_space <- c(rep('Red', 25), rep('White', 75))
            N <- 15

            trials_df <- data.frame(trials_index = integer(),
                                    proportion = double(),
                                    ci_min = double(),
                                    ci_max = double())

            for (i in 1:20) {
                    t <- run_trials(sample_space, N)
                    t_results <- get_trial_results(t, "Red")
                    trials_df <- rbind(trials_df, c(i, t_results[[1]][1], t_results[[2]][1], t_results[[2]][2]))
            }
            names(trials_df) <- c("trials_index", "proportion", "ci_min", "ci_max")

            print(trials_df)

            ggplot(trials_df, aes(trials_index, ci_max)) +
                    geom_segment(aes(xend = trials_index, yend = ci_min), size = 4, lineend = "butt",
                                 color = "turquoise4") +
                    geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed")
    }

    run_simulation()

我在代码中添加了@Simon的解决方案，并改善了地块的标签。开发这个小模拟程序有助于我理解置信区间。


    library(ggplot2)

    run_experiment <- function(sample_space, N) {
            sample(sample_space,
                   size = N,
                   replace = TRUE)
    }

    success_count <- function(experiment, success_value) {
            result <- sum(experiment == success_value)
            result
    }

    proportion <- function(experiment, success_value) {
            success_count(experiment, success_value) / length(experiment)
    }

    wald_mod <- function(success_count, trial_count) {
            z <- 1.96
            p_prime <- (success_count + (0.5 * z^2)) / (trial_count + z^2)
            W <- z * sqrt((p_prime * (1 - p_prime)) / (trial_count + z^2))
            result <- c((p_prime - W), (p_prime + W))
            result
    }

    get_experiment_results <- function(experiment, success_value) {
            p <- proportion(experiment, success_value)
            successes <- success_count(experiment, success_value)
            confidence_interval <- wald_mod(successes, length(experiment))
            p_plot_value <- confidence_interval[1] + p * abs(diff(confidence_interval))

            result <- list(c(p, p_plot_value), confidence_interval)
            result
    }

    run_simulation <- function() {
            sample_space <- c(rep('Red', 25), rep('White', 75))
            N <- 15

            experiments_df <- data.frame()

            for (i in 1:20) {
                    t <- run_experiment(sample_space, N)
                    t_results <- get_experiment_results(t, "Red")

                    experiments_df <- rbind(experiments_df, c(i, t_results[[1]][[1]], t_results[[1]][[2]], t_results[[2]][[1]], t_results[[2]][[2]]))
            }
            names(experiments_df) <- c("experiment_index", "proportion", "proportion_plot_value", "ci_min", "ci_max")

            print(experiments_df)

            # Jaap's answer on SO solves floating bar plot.
            # https://stackoverflow.com/questions/29916770/geom-bar-from-min-to-max-data-value
            # Simon's answer to me on SO solves plotting the proportion.
            # https://stackoverflow.com/questions/29916770/geom-bar-from-min-to-max-data-value
            ggplot(experiments_df, aes(experiment_index)) +
                    geom_segment(aes(xend = experiment_index, yend = ci_min, y = ci_max), size = 4, lineend = "butt",
                                 color = "turquoise4") +
                    geom_segment(aes(xend = experiment_index, yend = proportion_plot_value-.001, y = proportion_plot_value+.001), size = 4, lineend = "butt",
                                 color = "black") +
                    geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed") +
                    coord_cartesian(ylim = c(0, 1)) +
                    labs(x = "Experiment", y = "Probability",
                         title = "Each bar shows 95% CI computed from one
    simulated experiment",
                         subtitle = "Dashed line is true proportion in sample space",
                         caption = "Intuitive Biostatistics. Harvey Mitulsky. p. 36") 
    }

    run_simulation()

My final plot (which my reputation points don't yet permit me to paste)

Answer 1

首先计算相对于钢筋下端的比例：

trials_df <- data.frame(trials_index = integer(),
                          proportion = double(),
                          ci_min = double(),
                          ci_max = double())

  for (i in 1:20) {
    t <- run_trials(sample_space, N)
    t_results <- get_trial_results(t, "Red")
    trials_df <- rbind(trials_df, c(i, t_results[[1]][1], t_results[[2]][1], t_results[[2]][2], t_results[[2]][1]+t_results[[1]][1]*asbs(diff(t_results[[2]][2], t_results[[2]][1]))))
  }
  names(trials_df) <- c("trials_index", "proportion", "ci_min", "ci_max", 'proportion_max')

对于每个条形上的一条很小的水平线，您都可以这样做：

  ggplot(trials_df, aes(trials_index, ci_max)) +
    geom_segment(aes(xend = trials_index, yend = ci_min), size = 4, #lineend = "butt",
                 color = "turquoise4") +
    geom_segment(aes(xend = trials_index, yend = proportion_max-.001, y = proportion_max+.001), size = 4, lineend = "butt",
                 color = "turquoise3") +
    geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed")

您想要这些吗？

要给每个条形的较低比例着色，可以执行以下操作：

ggplot(trials_df, aes(trials_index, ci_max)) +
    geom_segment(aes(xend = trials_index, yend = ci_min), size = 4, #lineend = "butt",
                 color = "turquoise4") +
    geom_segment(aes(xend = trials_index, yend = ci_min, y = proportion_max), size = 4, lineend = "butt",
                 color = "turquoise3") +
    geom_abline(slope = 0, intercept = proportion(sample_space, "Red"), linetype = "dashed")

如何用不同的水平线注释ggplot2中的每个条形？

1 个答案: