Question

我有以下数据集，其中包含以下变量，用于指示某人是否使用过电话（虚拟变量，其中1 =使用过电话（“是”），其他为0（“否”））；他们的ID以及他们居住的地区和分区。请注意，同一个人在不同的分区下可能被记录两次或多次。但是，我只想算一次这样的人，也就是说，只考虑唯一的ID。

district sub_district   id  used_phone
    A   SX  1   Yes
    A   SX  2   Yes
    A   SX  3   No
    A   SX  4   No
    A   SY  4   No
    A   SY  5   Yes
    A   SZ  6   Yes
    A   SX  6   Yes
    A   SZ  7   No
    B   RX  8   No
    B   RV  9   No
    B   RX  9   No
    B   RV  10  Yes
    B   RV  11  Yes
    B   RT  12  Yes
    B   RT  13  Yes
    B   RV  13  Yes
    B   RT  14  No
    B   RX  14  No

N.B：used_phone是一个因子变量

对于上述数据集，我想绘制“是否使用过电话”的分布图，目前使用的代码如下：

  # Bar chart of how many rows of df fall into each used_phone category.
  # NOTE: geom_bar() counts every row, so an id recorded under several
  # sub-districts is counted more than once here.
  ggplot(df, aes(x = used_phone)) +
    geom_bar(color = "black", fill = "aquamarine4", position = "dodge") +
    labs(x = "Used phone", y = "Number of people") +
    ggtitle("Whether person used phone") +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5)) # centre the title

此代码可以正常工作。但是，我想做两件事：

为每个组的每个标签（是和否）添加％标签，但在y轴上显示“计数”
绘制图形，使其仅考虑唯一ID

我是R的新手，期待在您的帮助下解决此问题。

谢谢，拉奇塔

Answer 1

这是一个可行的建议：

根据used_phone汇总您的df，并统计使用过电话或未使用电话的总人数。
基于汇总计数，您可以计算百分比份额，并可以添加一个label列，其内容就是带有%号的百分比份额
您可以使用ggplot和新汇总的df进行绘图。您可以使用geom_text()在条形图的顶部添加百分比标签，并使用position_stack()中的vjust参数来调整标签的位置。

library(dplyr)
library(ggplot2)

# Summarise to one row per used_phone value.
# distinct() must name `id`: with no column given it only drops rows that are
# identical in every column, so a person recorded under two sub-districts
# would still be counted twice.
# The result is stored under a new name so the original df is not overwritten
# and the snippet can be re-run.
phone_counts <- df %>%
  distinct(id, .keep_all = TRUE) %>%
  count(used_phone, name = "count") %>%
  mutate(
    share = count / sum(count),                 # fraction of unique people
    label = paste0(round(share * 100, 2), "%")  # text drawn above each bar
  )

# Counts on the y axis, percentage labels just above the bars.
ggplot(phone_counts, aes(x = used_phone, y = count)) +
  geom_col(color = "black", fill = "aquamarine4") +
  geom_text(
    aes(label = label),
    position = position_stack(vjust = 1.02),  # slightly above the bar top
    size = 3
  ) +
  labs(
    title = "Whether person used phone",
    x = "Used Phone",
    y = "Number of People"
  ) +
  theme_bw()

Answer 2

由于重复的 id 分布在不同的sub_district中，而您不想重复计算它们，因此我删除了变量sub_district。然后去除所有重复的id，按是否使用电话进行计数并计算百分比。下面显示了由此得到的数据框。图形用ggplot的geom_col绘制，y轴上的百分比格式由scales包提供。

我注释掉了两行代码，取消注释后即可在ggplot中按地区（district）分面。相应的图表附在底部。

library(tidyverse)

# Example data from the question. The same id can appear under more than one
# sub_district (e.g. ids 4, 6, 9, 13 and 14).
df <- read.table(text="district sub_district   id  used_phone
    A   SX  1   Yes
    A   SX  2   Yes
    A   SX  3   No
    A   SX  4   No
    A   SY  4   No
    A   SY  5   Yes
    A   SZ  6   Yes
    A   SX  6   Yes
    A   SZ  7   No
    B   RX  8   No
    B   RV  9   No
    B   RX  9   No
    B   RV  10  Yes
    B   RV  11  Yes
    B   RT  12  Yes
    B   RT  13  Yes
    B   RV  13  Yes
    B   RT  14  No
    B   RX  14  No", header = TRUE)

# Raw row counts, before ids are de-duplicated.
table(df$used_phone)
#> 
#>  No Yes 
#>   9  10

# One row per person, then the number and share of people per answer.
ddf <- df %>%
  select(-sub_district) %>%           # drop sub_district
  distinct(id, .keep_all = TRUE) %>%  # keep one row per unique id
  # group_by(district) %>%            # uncomment to summarise per district
  count(used_phone) %>%               # count people per answer
  mutate(pct = n / sum(n))            # share of all counted people

ddf
#> # A tibble: 2 x 3
#>   used_phone     n   pct
#>   <chr>      <int> <dbl>
#> 1 No             6 0.429
#> 2 Yes            8 0.571

# Bar chart of the share of unique people per answer; the y axis is
# formatted as percentages.
ggplot(
  data = ddf,
  mapping = aes(x = used_phone, y = pct, fill = used_phone)
) +
  geom_col(position = "dodge") +
  # facet_wrap(~district) +   # uncomment for one panel per district
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("aquamarine4", "aquamarine3"))

基于注释的新添加：

y轴显示计数
将百分比作为条形上的标签
按地区（district）分面

# One row per person, then counts and percentages computed within each
# district (each district's percentages sum to 100%).
ddf <- df %>%
  select(-sub_district) %>%           # drop sub_district
  distinct(id, .keep_all = TRUE) %>%  # keep one row per unique id
  group_by(district) %>%
  count(used_phone) %>%               # count people per district and answer
  mutate(
    pct = n / sum(n),                         # share within the district
    label = paste0(round(pct * 100, 2), "%")  # text drawn above each bar
  ) %>%
  ungroup()                           # do not leak the grouping downstream

# Counts on the y axis, one panel per district, with the precomputed
# percentage label placed just above each bar.
ggplot(
  data = ddf,
  mapping = aes(x = used_phone, y = n, fill = used_phone)
) +
  geom_col(position = "dodge") +
  geom_text(
    mapping = aes(label = label),
    size = 3,
    position = position_stack(vjust = 1.05)
  ) +
  scale_fill_manual(values = c("aquamarine4", "aquamarine3")) +
  facet_wrap(~district) +
  labs(y = "count")

*新增* 更改百分比基础

# Same per-district counts, but percentages use the overall number of unique
# people as the denominator instead of the district total.
ddf <- df %>%
  select(-sub_district) %>%           # drop sub_district
  distinct(id, .keep_all = TRUE) %>%  # keep one row per unique id
  mutate(ssum = n()) %>%              # total rows before grouping
  group_by(district) %>%
  count(used_phone, ssum) %>%         # carry ssum through the count
  mutate(
    pct = n / ssum,                           # share of all unique people
    label = paste0(round(pct * 100, 2), "%")  # text drawn above each bar
  ) %>%
  ungroup()                           # do not leak the grouping downstream

我引入了一个新变量ssum，它在分组之前统计去重后的总行数（即唯一ID的总数），并用作百分比的分母。结果如下：

在条形图中添加百分比标签（ggplot2）

2 个答案: