我有一个数据集,并希望在建立预测模型之前进行一些探索性数据分析(EDA)。所有变量都是分类变量。我知道可以使用 `DataExplorer` 包快速完成一些 EDA:
library(tidyverse)
library(DataExplorer) # package names are case-sensitive: "DataExplorer"

# Simulated example data: three categorical columns, 100 rows each.
# `opinion` is the outcome and is drawn with unequal probabilities (0.8 / 0.2).
dat <- data.frame(
  circuit = sample(c("China", "Murica", "Brazil"), 100, replace = TRUE),
  driver = sample(c("Kimi", "Seb", "Max", "Lando", "Lance"), 100, replace = TRUE),
  opinion = sample(c("Garbage", "Not.Garbage"), 100, replace = TRUE, prob = c(0.8, 0.2))
)

# Univariate bar charts of the predictors only (the outcome is dropped first).
dat %>%
  select(-opinion) %>%
  plot_bar()
但是,我希望按 `opinion` 对 `circuit` 和 `driver` 的条形进行填充着色,以显示每个变量的各个类别中 `opinion` 的比例(请参见下文)。这样一来,我就可以看出哪些预测变量与结果变量的关联最紧密。
# Stacked bar chart of `circuit`, with each bar split by `opinion`.
# geom_bar() counts rows per category by default, so it is the right geom for
# a discrete x; geom_histogram(stat = "count") draws the same bars but warns
# about ignored binning parameters.
dat %>%
  ggplot(aes(x = circuit, fill = opinion)) +
  geom_bar()
但是,我不想单独构建每个图,然后使用grid.arrange来组织它们。
感谢您的帮助:)
答案 0 :(得分:1)
除非您修改 `plot_bar` 函数,否则我认为没有简便的方法,因为它目前只用于可视化单变量分布。运行下面的函数,它应该适用于您的示例:
library(tidyverse)
library(data.table) ## Note: You will need to load data.table
library(DataExplorer)
## Rewrite plot_bar
## plot_bar2: variant of DataExplorer::plot_bar() whose bars are filled by the
## levels of a grouping column.
##
## Arguments:
##   data             data.frame or data.table; converted to data.table if needed.
##   group            single column name (string) of a discrete column in `data`
##                    used as the fill aesthetic of every bar chart.
##   with             optional column name; when given, bars show the sum of
##                    this column (coerced to numeric) instead of row counts.
##   maxcat           discrete columns with more categories than this are
##                    ignored (delegated to DataExplorer:::.ignoreCat).
##   order_bar        if TRUE, bars are ordered with reorder(value, frequency).
##   binary_as_factor passed through to split_columns().
##   title, ggtheme, theme_config
##                    passed through to plotDataExplorer().
##   nrow, ncol       page layout; also passed to facet_wrap().
##   parallel         passed through to DataExplorer:::.lapply().
##
## Returns whatever plotDataExplorer() returns for the assembled plot list.
## Stops with an error when `group` is not a single string, is not a discrete
## column of `data`, or when no discrete feature other than `group` remains.
plot_bar2 <- function(data, group, with = NULL, maxcat = 50, order_bar = TRUE, binary_as_factor = TRUE, title = NULL, ggtheme = theme_gray(), theme_config = list(), nrow = 3L, ncol = 3L, parallel = FALSE) {
  ## Placeholders for column names that are used through non-standard evaluation.
  frequency <- measure <- variable <- value <- NULL

  ## Fail early with a readable message instead of an obscure melt() error.
  if (!is.character(group) || length(group) != 1L || is.na(group)) {
    stop("`group` must be a single column name.", call. = FALSE)
  }

  if (!is.data.table(data)) data <- data.table(data)
  if (!group %in% names(data)) {
    stop("`group` column `", group, "` not found in `data`.", call. = FALSE)
  }

  split_data <- split_columns(data, binary_as_factor = binary_as_factor)
  if (split_data$num_discrete == 0) stop("No discrete features found!")
  discrete <- split_data$discrete

  ## Drop discrete columns that have too many categories to plot sensibly.
  ind <- DataExplorer:::.ignoreCat(discrete, maxcat = maxcat)
  if (length(ind) > 0) {
    message(length(ind), " columns ignored with more than ", maxcat, " categories.\n", paste0(names(ind), ": ", ind, " categories\n"))
    drop_columns(discrete, names(ind))
    if (length(discrete) == 0) stop("Note: All discrete features ignored! Nothing to plot!")
  }

  ## Resolve the optional measure column BEFORE reading the feature names, so
  ## that a `with` column removed from `discrete` is not used as a grouping key.
  if (!is.null(with)) {
    with_col <- data[[with]]
    if (is.factor(with_col)) {
      measure_var <- suppressWarnings(as.numeric(levels(with_col))[with_col])
    } else if (is.character(with_col)) {
      measure_var <- as.numeric(with_col)
    } else {
      measure_var <- with_col
    }
    if (all(is.na(measure_var))) stop("Failed to convert `", with, "` to continuous!")
    if (with %in% names(discrete)) drop_columns(discrete, with)
  }

  if (!group %in% names(discrete)) {
    stop("`group` column `", group, "` is not among the discrete features being plotted.", call. = FALSE)
  }
  feature_names <- names(discrete)
  ## Features that actually get their own panel: everything except `group`.
  plot_features <- setdiff(feature_names, group)
  if (length(plot_features) == 0) {
    stop("No discrete features left to plot besides `", group, "`.", call. = FALSE)
  }

  ## Aggregate over the joint combination of all discrete features.
  if (is.null(with)) {
    dt <- discrete[, list(frequency = .N), by = feature_names]
  } else {
    tmp_dt <- data.table(discrete, "measure" = measure_var)
    dt <- tmp_dt[, list(frequency = sum(measure, na.rm = TRUE)), by = feature_names]
  }

  ## Long format: one row per (group level, feature, feature level) combination.
  dt2 <- suppressWarnings(melt.data.table(dt, id.vars = c(group, "frequency"), measure.vars = plot_features))

  ## Paginate over the plotted features only, so that the `group` column does
  ## not occupy a panel slot or shift the per-page feature index.
  layout <- DataExplorer:::.getPageLayout(nrow, ncol, length(plot_features))

  ## tools::toTitleCase is namespaced because `tools` is not attached here.
  y_label <- if (is.null(with)) "Frequency" else tools::toTitleCase(with)

  plot_list <- DataExplorer:::.lapply(
    parallel = parallel,
    X = layout,
    FUN = function(x) {
      page_dt <- dt2[variable %in% plot_features[x]]
      if (order_bar) {
        base_plot <- ggplot(page_dt, aes(x = reorder(value, frequency), y = frequency))
      } else {
        base_plot <- ggplot(page_dt, aes(x = value, y = frequency))
      }
      base_plot +
        ## `.data[[group]]` replaces the deprecated aes_string(); the legend
        ## title is set explicitly so it still reads as the column name.
        geom_bar(stat = "identity", aes(fill = .data[[group]])) +
        labs(fill = group) +
        coord_flip() +
        xlab("") + ylab(y_label)
    }
  )
  class(plot_list) <- c("multiple", class(plot_list))
  plotDataExplorer(
    plot_obj = plot_list,
    page_layout = layout,
    title = title,
    ggtheme = ggtheme,
    theme_config = theme_config,
    facet_wrap_args = list(
      "facet" = ~ variable,
      "nrow" = nrow,
      "ncol" = ncol,
      "scales" = "free"
    )
  )
}
## Create data and plot
# Simulated example data: three categorical columns, 100 rows each.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
dat <- data.frame(
  circuit = sample(c("China", "Murica", "Brazil"), 100, replace = TRUE),
  driver = sample(c("Kimi", "Seb", "Max", "Lando", "Lance"), 100, replace = TRUE),
  opinion = sample(c("Garbage", "Not.Garbage"), 100, replace = TRUE, prob = c(0.8, 0.2))
)

# One bar chart per remaining discrete column, each bar filled by `opinion`.
plot_bar2(dat, group = "opinion")
绘图结果如下: