Question

我有一个看起来像这样的数据框

> moviesInMostPopGenres[1, ]
      drama comedy short romance action crime thriller documentary
1         0      0     0       1      0     0        0           1         
  Runtime
1      70

我想绘制一组堆叠排列的箱线图：y 轴是电影类型，x 轴是运行时间。

如果 Genre 是单独的一列，我可以这样做：

# Horizontal box plots of Runtime, one box per level of Genre.
ggplot(df, aes(x = as.factor(Genre), y = Runtime)) +
  geom_boxplot() +
  scale_x_discrete(name = "Genre") +
  coord_flip()

但困难在于：每一行（每部电影）可能同时属于多个类型，而这些类型分散在多个指示变量（0/1 列）中。

在上面的例子中，这部电影既应计入“romance”的分布，也应计入“documentary”的分布，其运行时间均为 70。

Answer 1

下面是一种使用 ggplot2 的方法（使用 @neilfws 提供的数据集）：按类型组合分组绘制箱线图，并配上标示每个组合所含类型的图块图，以及每个类型组合中电影数量的条形图：

# Packages used by the plotting code below.
library(ggplot2)
library(ggstance)
library(dplyr)
library(tidyr)
library(egg)  # devtools::install_github("baptiste/egg")

# Label every movie with the genres it is flagged with: the names of the
# indicator columns equal to 1, sorted alphabetically and joined with "-".
# This label becomes the y-coordinate for all of the plots.
movies$set <- apply(select(movies, Drama:Documentary), 1, function(flags) {
  present <- names(flags)[which(flags == 1)]
  paste(sort(present), collapse = "-")
})

# Movies without any genre flag get an explicit "None" genre: a 0/1 indicator
# column plus a readable label in place of the empty string.
movies$None <- ifelse(movies$set == "", 1, 0)
movies$set[movies$set == ""] <- "None"

# Summarise each genre grouping (median run time and number of movies) and
# sort the groupings by median run time; the sorted labels define the factor
# levels.
set_order <- movies %>%
  group_by(set) %>%
  summarise(
    med_run = median(Runtime),
    n = n()
  ) %>%
  arrange(med_run) %>%
  mutate(set = factor(set, levels = set))

# Apply the same level order to the movie-level data.
movies <- mutate(movies, set = factor(set, levels = set_order$set))

现在创建并布置三个图：

# Default theme for every plot created from here on: classic theme without
# y-axis title or y-axis text.
theme_set(
  theme_classic() +
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank()
    )
)

# Horizontal box plots of run time, one per genre set.
p1 <- ggplot(movies, aes(x = Runtime, y = set)) +
  geom_boxploth() +
  scale_x_continuous(breaks = seq(0, 300, 60)) +
  # Zoom the x-axis to 60-240 without dropping observations outside the range.
  coord_cartesian(xlim = c(60, 240)) +
  theme(
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.margin = margin(r = -2)
  )

# Horizontal bar chart of the number of movies in each genre set.
# Bars with more than 50 movies carry their count as a white label centred
# inside the bar; smaller bars get an empty label.
p2 <- ggplot(set_order, aes(x = n, y = set)) +
  geom_barh(stat = "identity") +
  geom_text(
    aes(label = ifelse(n > 50, n, ""), x = 0.5 * n),
    colour = "white",
    size = 2.5
  ) +
  scale_x_continuous(
    expand = c(0, 0),
    limits = c(0, 1.02 * max(set_order$n))
  ) +
  theme(
    plot.margin = margin(r = 10),
    axis.line.y = element_line(colour = "grey70"),
    axis.ticks.y = element_blank()
  )

# Tile plot showing which genres make up each genre set.
# The indicator columns are reshaped to long form (one row per movie and
# genre), then reduced to a single row per (set, genre, flag) combination so
# each tile is drawn once. Tiles whose flag is 1 are labelled with the first
# two letters of the genre name.
p3 <- movies %>%
  gather(key, value, Drama:Documentary) %>%
  group_by(set, key, value) %>%
  slice(1) %>%
  ungroup() %>%
  mutate(key = factor(key)) %>%
  ggplot(aes(key, set)) +
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for the
  # tile border; `size` is kept here so older ggplot2 versions still work.
  geom_tile(aes(fill = factor(value)), colour = "white", size = 0.5) +
  geom_text(
    aes(label = ifelse(value == 1, substr(key, 1, 2), "")),
    colour = "white",
    size = 2.3
  ) +
  theme(
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    axis.title.x = element_blank()
  ) +
  scale_fill_manual(values = c("grey90", "grey30")) +
  # "none" replaces the deprecated guides(fill = FALSE) for hiding the legend.
  guides(fill = "none")

# Lay the three plots out side by side: box plots, genre tiles, counts.
ggarrange(p1, p3, p2, ncol = 3, widths = c(5, 3, 5))

如果您想更改排列顺序，例如按每个类型组合中的电影数量排序，只需重新设置顺序：

# Re-derive the order of the genre groupings, this time sorted by the number
# of movies in each grouping (the median run time is still computed, but it
# no longer drives the order).
set_order <- movies %>%
  group_by(set) %>%
  summarise(
    med_run = median(Runtime),
    n = n()
  ) %>%
  arrange(n) %>%
  mutate(set = factor(set, levels = set))

# Apply the new level order to the movie-level data.
movies <- mutate(movies, set = factor(set, levels = set_order$set))

然后运行与上面相同的绘图代码来获得：

Answer 2

一种解决方案是使用 UpSetR 包。它自带一个与你的数据非常相似的电影数据集，详见该包的文档。

library(UpSetR)
library(dplyr)

# Load the example movies dataset shipped with UpSetR (semicolon-separated).
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
movies <- read.csv(
  system.file("extdata", "movies.csv", package = "UpSetR"),
  header = TRUE,
  sep = ";"
)

# Make it look more like the example in the question:
# first column is the movie name, the rest are genre indicator columns.
movies <- movies %>%
  select(Name, Drama, Comedy, Romance, Action, Crime, Thriller, Documentary)

# Add a synthetic Runtime column: one random value from 60-240 per movie,
# seeded so the example is reproducible.
set.seed(123)
movies <- movies %>%
  mutate(Runtime = sample(60:240, nrow(.), replace = TRUE))

# UpSet plot of the 7 genre sets, with a Runtime box plot per intersection.
upset(movies, boxplot.summary = c("Runtime"), nsets = 7)

顶部的条形图显示各个交集及其大小；其下方的矩阵图标明每个交集所包含的类型；最下方是每个交集的 Runtime 箱线图。

基于指示变量的堆叠箱线图

2 个答案: