我不是统计学家,但我确实希望使用基本概率来了解我的数据发生了什么。
我摸索出了一种繁琐但非常有用的方法:先用直方图查看特定区间内的数据,然后把我想分析的各个组别与整体进行比较。它为我们公司揭示了一些令人惊喜的洞见,而且图表所反映的情况很容易解释。虽然这种做法确实非常繁琐,但这类分析非常有用,我想应该已经有人为它编写了现成的函数。
以下是我的代码。是否已有现成的函数可以完成这类分析?我也用过 logi.hist.plot(),它做的事情类似,但它可能存在一些问题,而且我更喜欢这种数据的"原始视图"。
library(dplyr)
library(ggplot2)
# Create the data ----
# Purpose of this script: bin UserCount with hist()'s break points, sum the
# user counts per bin separately for the YES (1) and NO (0) groups, and plot
# the YES share of each bin as a percentage.
set.seed(84102) # kept from the original; nothing below draws random numbers
daba <- data.frame(
  YES_NO = c(0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1)
)
daba$UserCount <- c(
  23, 43, 45, 65, 32, 10, 34, 68, 65, 75,
  43, 24, 37, 54, 73, 29, 87, 32, 21, 12
)

# Create the bins using hist() ----
# plot = FALSE: only the break points are needed, so do not draw a histogram.
bin_edges <- hist(daba$UserCount, breaks = 20, plot = FALSE)$breaks
# include.lowest = TRUE keeps the minimum value (10 here) inside the first
# bin. Without it cut() returns NA for that row, which previously ended up in
# an artificial bin labelled 0.
hist_breaks <- cut(daba$UserCount, breaks = bin_edges, include.lowest = TRUE)
# Label every row with the upper edge of its bin. Indexing the break vector by
# the factor code avoids parsing the "(a,b]" label text.
daba$Breaks <- bin_edges[as.integer(hist_breaks) + 1]

# Create two data groups to be compared ----
daba_NO <- filter(daba, YES_NO == 0)
daba_YES <- filter(daba, YES_NO == 1)

# Aggregate user count into histogram bins using aggregate() ----
# The formula reads "sum UserCount within each Breaks value". The original had
# the two sides swapped, which summed the bin labels per distinct user count.
daba_NOAgg <- aggregate(UserCount ~ Breaks, data = daba_NO, FUN = sum)
daba_YESAgg <- aggregate(UserCount ~ Breaks, data = daba_YES, FUN = sum)

# Rename the columns to clean it up (aggregate() returns group column first)
colnames(daba_NOAgg) <- c("Breaks", "UserCountNo")
colnames(daba_YESAgg) <- c("Breaks", "UserCountYes")

# Merge the two groups back together ----
# all = TRUE keeps bins that contain only one of the two groups; the missing
# side is a genuine count of zero, so fill it in instead of dropping the bin.
daba_SUMAgg <- merge(x = daba_NOAgg, y = daba_YESAgg, by = "Breaks", all = TRUE)
daba_SUMAgg$UserCountNo[is.na(daba_SUMAgg$UserCountNo)] <- 0
daba_SUMAgg$UserCountYes[is.na(daba_SUMAgg$UserCountYes)] <- 0

# Generate basic probability (in percent) for Yes group of users ----
daba_SUMAgg$Probability <- 100 * daba_SUMAgg$UserCountYes /
  (daba_SUMAgg$UserCountNo + daba_SUMAgg$UserCountYes)

# Graph the data ----
# Bare column names inside aes() are looked up in the `data` argument.
ggplot(data = daba_SUMAgg) +
  geom_point(alpha = 0.4, mapping = aes(x = Breaks, y = Probability)) +
  labs(x = "BINS", y = "PROBABILITY", title = "PROBABILITY ANALYSIS USING BINS")
daba_SUMAgg
答案 0 :(得分:0)
我建议使用 dplyr 的 group_by:这样就无需拆分数据集,也无需为了绘图而把区间转换成数值。我认为你的处理过程遗漏了一些东西(见上面的评论)。
library(dplyr)
library(ggplot2)
# Create the data ----
# Purpose of this script: bin UserCount with hist()'s break points, sum the
# user counts per bin and YES_NO value, convert each sum to its share of the
# bin total, and plot one line per YES_NO value.
set.seed(84102) # kept from the original; nothing below draws random numbers
daba <- data.frame(
  YES_NO = c(0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1)
)
daba$UserCount <- c(
  23, 43, 45, 65, 32, 10, 34, 68, 65, 75,
  43, 24, 37, 54, 73, 29, 87, 32, 21, 12
)

# plot = FALSE: only the break points are needed, so do not draw a histogram.
bin_edges <- hist(daba$UserCount, breaks = 20, plot = FALSE)$breaks

daba %>%
  # Left-closed bins (right = FALSE) keep the minimum value 10 in the first
  # bin; include.lowest = TRUE additionally closes the last bin on the right
  # so that a value equal to the final break is not turned into NA.
  mutate(
    Breaks = cut(
      UserCount,
      breaks = bin_edges,
      right = FALSE,
      include.lowest = TRUE
    )
  ) %>%
  group_by(Breaks, YES_NO) %>% # for every range and YES_NO value
  summarise(UserCount = sum(UserCount)) %>% # get sum of counts
  # Regroup explicitly by bin so the share below is computed within each bin
  # rather than relying on the grouping summarise() happens to leave behind.
  group_by(Breaks) %>%
  mutate(Prc = UserCount / sum(UserCount)) %>% # share of the bin total
  ungroup() %>% # forget the grouping
  mutate(YES_NO = factor(YES_NO)) %>% # discrete colour scale for the plot
  ggplot(aes(Breaks, Prc, col = YES_NO, group = YES_NO)) +
  geom_point() +
  geom_line()
请逐步运行管道中的每一步,以了解数据处理的过程,以及数据集在绘图之前的样子。