我有一个看起来像这样的数据框
> moviesInMostPopGenres[1, ]
drama comedy short romance action crime thriller documentary
1 0 0 0 1 0 0 0 1
Runtime
1 70
我想绘制一组上下堆叠排列的箱线图:y 轴是电影类型(genre),x 轴是片长(Runtime)。
如果 Genre 只是像这样单独的一列,我可以这样写:
# Horizontal box plots of Runtime, one box per level of Genre.
# Genre is mapped to x and the coordinates are then flipped, so the discrete
# scale (labelled "Genre") is drawn on the vertical axis.
ggplot(data = df, mapping = aes(x = as.factor(Genre), y = Runtime)) +
  geom_boxplot() +
  scale_x_discrete(name = "Genre") +
  coord_flip()
但困难在于:每一行(每部电影)可以同时属于多个类型,而这些类型是分散在多个指示变量(0/1 列)中的。
在上面的例子中,这部电影既属于“romance”(爱情)的分布,也属于“documentary”(纪录片)的分布,在这两个分布中的片长都是 70。
答案 0 :(得分:2)
下面是一种使用 ggplot2 的方法(数据集来自 @neilfws 的回答):按类型组合绘制箱线图,同时配上标示每个组合所含类型的标记图,以及每个类型组合的电影数量条形图:
library(ggplot2)
library(ggstance)
library(dplyr)
library(tidyr)
library(egg) # devtools::install_github("baptiste/egg")
# Label every movie with its genre "set": the names of all genre indicator
# columns (Drama through Documentary) that equal 1, sorted alphabetically and
# joined with "-". This label is the y-coordinate shared by all of the plots.
movies$set <- apply(
  as.matrix(select(movies, Drama:Documentary)),
  1,
  function(flags) {
    member_genres <- names(flags)[which(flags == 1)]
    paste(sort(member_genres), collapse = "-")
  }
)

# Movies without any genre flag end up with an empty label: record them in a
# 0/1 "None" indicator column and give them the explicit set name "None".
movies$None <- as.numeric(movies$set == "")
movies$set <- replace(movies$set, movies$set == "", "None")

# Summarise each genre set (median run time and number of movies), sort the
# sets by median run time, and freeze that order as the factor levels of `set`.
set_order <- movies %>%
  group_by(set) %>%
  summarise(med_run = median(Runtime), n = n()) %>%
  arrange(med_run) %>%
  mutate(set = factor(set, levels = set))

# Apply the same level order to the per-movie data so that every plot lists
# the genre sets in the same sequence.
movies$set <- factor(movies$set, levels = set_order$set)
现在创建并布置三个图:
# Default theme for every plot created from here on: the classic theme with
# the y-axis title and the y-axis tick labels removed.
theme_set(
  theme_classic() +
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank()
    )
)
# Horizontal box plots (ggstance::geom_boxploth) of run time, one per genre
# set. The x-axis has a break every 60 units and the visible range is limited
# to 60-240 through coord_cartesian().
# The y axis line and ticks are hidden; the negative right margin presumably
# pulls the neighbouring panel closer -- confirm against the rendered layout.
p1 <- ggplot(movies, aes(x = Runtime, y = set)) +
  geom_boxploth() +
  scale_x_continuous(breaks = seq(0, 300, by = 60)) +
  coord_cartesian(xlim = c(60, 240)) +
  theme(
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.margin = margin(r = -2)
  )
# Horizontal bar chart of the number of movies in each genre set.
# Bars with more than 50 movies carry their count as a white label centred in
# the bar; smaller bars get an empty label.
# The x scale starts exactly at 0 (no expansion) and ends 2% past the largest
# count.
p2 <- ggplot(set_order, aes(x = n, y = set)) +
  geom_barh(stat = "identity") +
  geom_text(
    aes(x = 0.5 * n, label = ifelse(n > 50, n, "")),
    colour = "white",
    size = 2.5
  ) +
  scale_x_continuous(
    expand = c(0, 0),
    limits = c(0, 1.02 * max(set_order$n))
  ) +
  theme(
    plot.margin = margin(r = 10),
    axis.line.y = element_line(colour = "grey70"),
    axis.ticks.y = element_blank()
  )
# Membership grid: one tile per (genre set, genre) pair, dark when the set
# contains the genre and light otherwise. Dark tiles are labelled with the
# first two letters of the genre name.
p3 <- movies %>%
  # Long format: one row per movie and genre column, with its 0/1 value
  gather(key, value, Drama:Documentary) %>%
  # Only one row per set/genre/value combination is needed to draw a tile
  distinct(set, key, value) %>%
  mutate(key = factor(key)) %>%
  ggplot(aes(x = key, y = set)) +
  # NOTE(review): on ggplot2 >= 3.4.0 the tile border width is set with
  # `linewidth`; `size` is kept here so older ggplot2 versions still work.
  geom_tile(aes(fill = factor(value)), colour = "white", size = 0.5) +
  geom_text(
    aes(label = ifelse(value == 1, substr(key, 1, 2), "")),
    colour = "white",
    size = 2.3
  ) +
  theme(
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5),
    axis.title.x = element_blank()
  ) +
  # Manual fill colours for the two factor(value) levels (0 and 1)
  scale_fill_manual(values = c("grey90", "grey30")) +
  # "none" suppresses the fill legend; `guides(fill = FALSE)` is deprecated
  guides(fill = "none")
# Lay the three plots out side by side (box plots, membership grid, counts)
# with relative widths 5:3:5, using the ggarrange() from the egg package.
egg::ggarrange(p1, p3, p2, ncol = 3, widths = c(5, 3, 5))
如果您想更改排列顺序,例如按每个类型组合中的电影数量排序,只需重新设置顺序:
# Get order of genre groupings by number of movies in each set.
# The median run time is still computed for each set but is not used for
# sorting here.
set_order <- movies %>%
  group_by(set) %>%
  summarise(med_run = median(Runtime), n = n()) %>%
  arrange(n) %>%
  mutate(set = factor(set, levels = set))

# Apply the new order of genre groupings to the per-movie data
movies <- movies %>%
  mutate(set = factor(set, levels = set_order$set))
然后运行与上面相同的绘图代码来获得:
答案 1 :(得分:1)
一种解决方案是使用 UpSetR 包。它自带一个与你的数据非常相似的电影数据集,详见该包的文档。
library(UpSetR)
library(dplyr)
# Load the example movies dataset shipped with UpSetR
# (a semicolon-separated file with a header row).
movies <- read.csv(
  system.file("extdata", "movies.csv", package = "UpSetR"),
  header = TRUE,
  sep = ";"
)

# Make it look more like the example in the question:
# the first column is the movie name, the remaining columns are 0/1 genre
# indicators.
movies <- movies %>%
  select(Name, Drama, Comedy, Romance, Action, Crime, Thriller, Documentary)

# Add a simulated Runtime column: one value per movie, drawn with replacement
# from 60:240. The seed is fixed so the example is reproducible.
set.seed(123)
movies <- movies %>%
  mutate(Runtime = sample(60:240, n(), replace = TRUE))

# UpSet plot over the 7 genre sets, with a box plot of Runtime for each
# intersection.
upset(movies, boxplot.summary = c("Runtime"), nsets = 7)
顶部的条形图显示各个交集及其大小;其下方的矩阵图标示每个交集所包含的类型;最下方是每个交集对应的 Runtime 箱线图。