使用两个属性组合频率直方图

时间:2016-02-07 15:18:11

标签: r ggplot2 histogram

我使用ggplot2为两个不同的参数创建直方图。我当前的方法附在问题的最后(包括一个可以直接从pastebin.com加载和使用的数据集),它会创建:

  1. 根据 "location" 属性("WITHIN" 或 "NOT_WITHIN")可视化所记录用户数据的空间分布频率的直方图。
  2. 根据 "context" 属性("Clicked A" 或 "Clicked B")可视化所记录用户数据的分布频率的直方图。
  3. 这看起来像下面的: enter image description here

    # Read the semicolon-separated example dataset straight from pastebin
    RawDataSet <- read.csv(file = "http://pastebin.com/raw/uKybDy03", sep = ";")
    # Load packages
    library(plyr)
    library(dplyr)
    library(reshape2)
    library(ggplot2)
    
    ###### Create Frequency Table for Location-Information
    # One row per user: the total number of records plus how many of them have
    # location "WITHIN" and how many have location "NOT_WITHIN".
    LocationFrequency <- ddply(RawDataSet, .(UserEmail), summarize,
                               All = length(UserEmail),
                               Within_area = sum(location == "WITHIN"),
                               Not_within_area = sum(location == "NOT_WITHIN"))
    # Numeric per-user identifier (1..n), shown on the x axis instead of UserEmail.
    # Built directly as a number rather than converting row names from strings.
    LocationFrequency <- mutate(LocationFrequency,
                                id = seq_len(nrow(LocationFrequency)))
    # Put the id column first; select by name so the order of the summary
    # columns above can change without silently breaking this step.
    LocationFrequency <- LocationFrequency[, c("id", "UserEmail", "All",
                                               "Within_area", "Not_within_area")]
    # Melt to long format: one row per (user, location type) with its count
    LocationFrequency.m <- melt(LocationFrequency,
                                id.vars = c("UserEmail", "All", "id"))
    # Plot data (stacked bars: one bar per user, one segment per location type)
    p <- ggplot(LocationFrequency.m, aes(x = id, y = value, fill = variable)) +
      geom_bar(stat = "identity") +
      theme_grey(base_size = 16) +
      labs(title = "Histogram showing the distribution of all spatial information per user.") +
      labs(x = "User", y = "Number of notifications interaction within/not within the area") +
      # using IDs instead of UserEmail; ticks are derived from the data so that
      # every user gets one, however many users the dataset contains
      scale_x_continuous(breaks = LocationFrequency$id,
                         labels = as.character(LocationFrequency$id))
    # Change legend Title (prints the plot; `p` itself is left unchanged)
    p + labs(fill = "Type of location")
    
    
    
    ##### Create Frequency Table for Interaction-Information
    # One row per user: the total number of records plus how many of them have
    # context "Clicked A" and how many have context "Clicked B".
    InterationFrequency <- ddply(RawDataSet, .(UserEmail), summarize,
                                 All = length(UserEmail),
                                 Clicked_A = sum(context == "Clicked A"),
                                 Clicked_B = sum(context == "Clicked B"))
    # Numeric per-user identifier (1..n), shown on the x axis instead of UserEmail.
    # Built directly as a number rather than converting row names from strings.
    InterationFrequency <- mutate(InterationFrequency,
                                  id = seq_len(nrow(InterationFrequency)))
    # Put the id column first; select by name so the order of the summary
    # columns above can change without silently breaking this step.
    InterationFrequency <- InterationFrequency[, c("id", "UserEmail", "All",
                                                   "Clicked_A", "Clicked_B")]
    # Melt to long format: one row per (user, interaction type) with its count
    InterationFrequency.m <- melt(InterationFrequency,
                                  id.vars = c("UserEmail", "All", "id"))
    # Plot data (stacked bars: one bar per user, one segment per interaction type)
    p <- ggplot(InterationFrequency.m, aes(x = id, y = value, fill = variable)) +
      geom_bar(stat = "identity") +
      theme_grey(base_size = 16) +
      labs(title = "Histogram showing the distribution of all interaction types per user.") +
      labs(x = "User", y = "Number of interaction") +
      # using IDs instead of UserEmail; ticks are derived from the data so that
      # every user gets one, however many users the dataset contains
      scale_x_continuous(breaks = InterationFrequency$id,
                         labels = as.character(InterationFrequency$id))
    # Change legend Title (a separate statement, not part of the chain above;
    # it prints the plot and leaves `p` itself unchanged)
    p + labs(fill = "Type of interaction")
    

    但是我想要实现的目标是:如何把这两个直方图合并到同一张图中?是否可以为每个部分标出相应的百分比?我的设想如下图所示:条形的完整高度表示每个用户的观察总数,并用不同的分段显示相应的数据。每个条形先分为两个部分(within 和 not_within),然后每个部分再分成两个子部分,显示交互类型的百分比("Clicked A" 或 "Clicked B")。

    sketch

1 个答案:

答案 0 :(得分:3)

根据更新后的说明,我会制作一个由两部分组成的组合条形图:一部分为负值,一部分为正值。为了实现这一目标,您必须先把数据整理成正确的格式:

# load needed libraries
library(dplyr)
library(tidyr)
library(ggplot2)

# Count records per user / location / context, then add two signed columns:
#   n2 - the count, negated on "NOT_WITHIN" rows
#   p  - the count's share of its user/location group, negated likewise
# `1 - 2 * (cond)` evaluates to -1 where cond is TRUE and to 1 where it is FALSE.
new.df <- RawDataSet %>%
  group_by(UserEmail, location, context) %>%
  tally() %>%
  mutate(n2 = (1 - 2 * (location == "NOT_WITHIN")) * n) %>%
  group_by(UserEmail, location) %>%
  mutate(p = (1 - 2 * (location == "NOT_WITHIN")) * n / sum(n))

new.df数据框如下所示:

> new.df
Source: local data frame [90 x 6]
Groups: UserEmail, location [54]

   UserEmail   location   context     n    n2          p
      (fctr)     (fctr)    (fctr) (int) (dbl)      (dbl)
1      andre NOT_WITHIN Clicked A     3    -3 -1.0000000
2       bibi NOT_WITHIN Clicked A     4    -4 -0.5000000
3       bibi NOT_WITHIN Clicked B     4    -4 -0.5000000
4       bibi     WITHIN Clicked A     9     9  0.6000000
5       bibi     WITHIN Clicked B     6     6  0.4000000
6     corinn NOT_WITHIN Clicked A    10   -10 -0.5882353
7     corinn NOT_WITHIN Clicked B     7    -7 -0.4117647
8     corinn     WITHIN Clicked A     9     9  0.7500000
9     corinn     WITHIN Clicked B     3     3  0.2500000
10  dpfeifer NOT_WITHIN Clicked A     7    -7 -1.0000000
..       ...        ...       ...   ...   ...        ...

接下来,您可以使用以下内容创建绘图:

# Diverging stacked bar chart of the signed counts (n2), one bar per user.
# The two locations are drawn as separate layers; the constant strings given
# to `color` inside aes() only serve as legend keys, which scale_color_manual()
# then maps to the actual outline colours.
ggplot() +
  geom_bar(
    data = new.df[new.df$location == "NOT_WITHIN", ],
    mapping = aes(x = UserEmail, y = n2, color = "darkgreen", fill = context),
    stat = "identity", width = 0.7, size = 1
  ) +
  geom_bar(
    data = new.df[new.df$location == "WITHIN", ],
    mapping = aes(x = UserEmail, y = n2, color = "darkred", fill = context),
    stat = "identity", width = 0.7, size = 1
  ) +
  # Outline colour encodes the location, fill colour the interaction type
  scale_fill_manual(
    "Type of interaction",
    values = c("lightyellow", "lightblue"),
    labels = c("Clicked A", "Clicked B")
  ) +
  scale_color_manual(
    "Location of interaction",
    values = c("darkgreen", "darkred"),
    labels = c("NOT_WITHIN", "WITHIN")
  ) +
  # Tick labels show magnitudes, so both directions of the axis read as counts
  scale_y_continuous(
    breaks = seq(-20, 20, 5),
    labels = abs(seq(-20, 20, 5))
  ) +
  guides(
    color = guide_legend(
      override.aes = list(color = c("darkred", "darkgreen"), fill = NA, size = 2),
      reverse = TRUE
    ),
    fill = guide_legend(
      override.aes = list(fill = c("lightyellow", "lightblue"),
                          color = "black", size = 0.5)
    )
  ) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
    legend.title = element_text(face = "italic", size = 14),
    legend.text = element_text(size = 11),
    legend.key.size = unit(1, "lines")
  )

导致:

enter image description here

如果您想使用百分比值,可以使用 p 列来绘图:

# Diverging stacked bar chart of the signed shares (p), one horizontal bar per
# user. The two locations are drawn as separate layers; the constant strings
# given to `color` inside aes() only serve as legend keys, which
# scale_color_manual() then maps to the actual outline colours.
ggplot() +
  geom_bar(
    data = new.df[new.df$location == "NOT_WITHIN", ],
    mapping = aes(x = UserEmail, y = p, color = "darkgreen", fill = context),
    stat = "identity", width = 0.7, size = 1
  ) +
  geom_bar(
    data = new.df[new.df$location == "WITHIN", ],
    mapping = aes(x = UserEmail, y = p, color = "darkred", fill = context),
    stat = "identity", width = 0.7, size = 1
  ) +
  # Outline colour encodes the location, fill colour the interaction type
  scale_fill_manual(
    "Type of interaction",
    values = c("lightyellow", "lightblue"),
    labels = c("Clicked A", "Clicked B")
  ) +
  scale_color_manual(
    "Location of interaction",
    values = c("darkgreen", "darkred"),
    labels = c("NOT_WITHIN", "WITHIN")
  ) +
  # Tick labels show magnitudes, so both directions read as 0%-100%
  scale_y_continuous(
    breaks = seq(-1, 1, 0.25),
    labels = scales::percent(abs(seq(-1, 1, 0.25)))
  ) +
  # Users on the vertical axis, shares on the horizontal axis
  coord_flip() +
  guides(
    color = guide_legend(
      override.aes = list(color = c("darkred", "darkgreen"), fill = NA, size = 2),
      reverse = TRUE
    ),
    fill = guide_legend(
      override.aes = list(fill = c("lightyellow", "lightblue"),
                          color = "black", size = 0.5)
    )
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.title = element_blank(),
    legend.title = element_text(face = "italic", size = 14),
    legend.text = element_text(size = 11),
    legend.key.size = unit(1, "lines")
  )

导致:

enter image description here

回复评论

如果要将文本标签放在条形图内,则还必须计算位置变量:

# Same summary as before, plus `pos`: the y position for a text label.
#   "Clicked A" rows: p / 2
#   "Clicked B" rows: 1 - |p| / 2, negated on "NOT_WITHIN" rows
# `1 - 2 * (cond)` evaluates to -1 where cond is TRUE and to 1 where it is FALSE.
new.df <- RawDataSet %>%
  group_by(UserEmail, location, context) %>%
  tally() %>%
  mutate(n2 = (1 - 2 * (location == "NOT_WITHIN")) * n) %>%
  group_by(UserEmail, location) %>%
  mutate(
    p = (1 - 2 * (location == "NOT_WITHIN")) * n / sum(n),
    pos = (context == "Clicked A") * p / 2 +
      (context == "Clicked B") *
        ((1 - 2 * (location == "NOT_WITHIN")) * (1 - abs(p) / 2))
  )

然后在 ggplot 代码中的 geom_bar 图层之后添加以下行:

geom_text(
  data = new.df,
  mapping = aes(x = UserEmail, y = pos, label = n)
)

导致:

enter image description here

如果想显示百分比,您也可以使用 label = scales::percent(abs(p)) 来代替 label = n。