绘制子组的比例差异 - 不起作用

时间:2018-04-09 11:22:00

标签: r ggplot2 mutate

我有一个数据框(date,感兴趣的结果v169和群组treated),如下所示:

Dataframe <- data.frame(
  date = structure(c(12482, 12499, 12478, 12484, 12477, 12492, 12475, 12490, 12490, 12482, 12488, 12474, 12487, 12474, 12473, 12473, 12478, 12474, 12481, 12474, 12489, 12485, 12479, 12479, 12479, 12479, 12481, 12477, 12474, 12481, 12481, 12478, 12478, 12473, 12479, 12479, 12478, 12482, 12480, 12478, 12480, 12479, 12475, 12481, 12480, 12477, 12477, 12477, 12476, 12476, 12474, 12473, 12474, 12483, 12472, 12479, 12481, 12488, 12481, 12482, 12481, 12482, 12488, 12478, 12474, 12481, 12481, 12480, 12478, 12479, 12475, 12476, 12478, 12482, 12479, 12478, 12478, 12477, 12479, 12479, 12479, 12479, 12478, 12480, 12478, 12487, 12482, 12475, 12475, 12474, 12474, 12478, 12473, 12485, 12482, 12473, 12474, 12472, 12478, 12478, 12479, 12479, 12488, 12476, 12492, 12493, 12479, 12482, 12480, 12476, 12476, 12482, 12479, 12475, 12472, 12475, 12475, 12475, 12482, 12482, 12482, 12478, 12480, 12485, 12480, 12482, 12481, 12480, 12480, 12480, 12480, 12478, 12481, 12478, 12478, 12479, 12481, 12481, 12482, 12482, 12482, 12479, 12478, 12476, 12483, 12475, 12477, 12477, 12480, 12485, 12485, 12479, 12476, 12480, 12476, 12481, 12485, 12479, 12480, 12484, 12479, 12481, 12487, 12490, 12486, 12482, 12480, 12494, 12493, 12485, 12479, 12477, 12477, 12481, 12481, 12483, 12480, 12479, 12483, 12472, 12474, 12471, 12482, 12479, 12489, 12480, 12494, 12481, 12483, 12483, 12488, 12471, 12476, 12482), class = "Date"),
  v169 = c(1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0),
  treated = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))

我想使用以下代码绘制 treated == 1 和 treated == 0 两组之间结果 v169 的比例之差(如此处所述:Plot difference between proportions for subgroups):

# Asker's attempt: plot, per day, the difference between the two `treated`
# groups in the share of rows with v169 == 1.
Dataframe %>%
  mutate(
    treated2 = factor(treated, levels = c("0", "1")),
    date = as.POSIXct(date)) %>% # convert date to POSIXct
  group_by(treated2, date) %>% # one group per treatment level and day
  summarise(
    prop = sum(v169 == "1") / n()) %>% # share of rows with v169 == 1 in each group
  spread(treated2, prop) %>% # one column per level of treated2, holding prop
  # NOTE(review): the bare 1 and 0 below are numeric literals, not column
  # references, so propdiff is the constant 1 on every row. The asker reports
  # that the quoted forms "1" and "0" fail with: Error in mutate_impl(.data, dots) :
  # Evaluation error: non-numeric argument to binary operator.
  mutate(propdiff = 1 - 0) %>%
  ggplot(aes(date, propdiff)) +
  geom_line() + 
  geom_point()

不幸的是,该解决方案对这个数据集不起作用:差异始终为 1.00(如果 propdiff = 1 - 0)或 -1.00(如果 propdiff = 0 - 1),也就是说,两组数据中似乎只有一组被计算在内。

我无法弄清楚原因——这段代码在我于另一个问题中提供的示例数据集上可以正常运行,但这份数据似乎有某种我找不出的特殊之处。我相信这对有经验的程序员来说并不是什么难题,但我自己想不明白。有人能给我指个方向吗?

1 个答案:

答案 0 :(得分:1)

尝试在 mutate 调用中使用反引号,即 `1` 和 `0`。

library(tidyverse)

# Per-day share of v169 == 1 within each level of `treated`, reshaped so that
# each level becomes its own column (named `0` and `1`), then differenced and
# plotted over time.
Dataframe %>%
  mutate(
    treated2 = factor(treated, levels = c("0", "1")),
    date = as.POSIXct(date)
  ) %>%
  group_by(treated2, date) %>%
  summarise(prop = sum(v169 == "1") / n()) %>%
  spread(key = treated2, value = prop) %>%
  # Backticks make R read `1` and `0` as column names rather than as the
  # numeric literals 1 and 0.
  mutate(propdiff = `1` - `0`) %>%
  ggplot(mapping = aes(x = date, y = propdiff)) +
  geom_line() +
  geom_point()
  

警告:1:删除了包含缺失值的两行(geom_path)。   2:删除了包含缺失值的7行(geom_point)。

enter image description here

在存在缺失值的情况下,用 geom_line 把各点连接起来的一种方法(参见此答案):https://stackoverflow.com/a/9641380/8583393

# Per-day difference in the share of v169 == 1 between the `treated` groups
# "1" and "0". After the reshape each group is a column; a day that lacks one
# of the two groups gets NA there, so propdiff is NA for that day.
df <- Dataframe %>%
  mutate(
    treated2 = factor(treated, levels = c("0", "1")),
    date = as.POSIXct(date)
  ) %>%
  group_by(treated2, date) %>%
  summarise(prop = sum(v169 == "1") / n()) %>%
  spread(treated2, prop) %>%
  # Backticks make R read `1` and `0` as column names, not numeric literals.
  mutate(propdiff = `1` - `0`)

df %>%
  ggplot(aes(date, propdiff)) +
  # Draw the line from the non-missing rows only, so it is not interrupted on
  # days where propdiff is NA.
  geom_line(data = df[!is.na(df$propdiff), ]) +
  # Rows with NA propdiff cannot be drawn as points; na.rm = TRUE drops them
  # explicitly instead of emitting the "missing values" warning.
  geom_point(na.rm = TRUE)

enter image description here