我正在对一些数据做质量检查/质量控制(QA/QC),想查看每个组中被标记(flagged,即可能更容易出错)的数据所占的百分比。我需要对 15 个组重复同样的计算,但不确定最好的做法是什么。也许用 for 循环?
# Keep only the rows that belong to group 1.
group_1 <- filter(flow_group_df, GROUP == 1)
# Count the cells in the group-1 subset equal to "flagged".
# Summing the logical comparison counts the TRUE cells; NA cells are skipped.
group_1_flagged <- sum(group_1 == "flagged", na.rm = TRUE)
# Count the cells equal to "checked", i.e. reviewed but not flagged.
class_1_checked <- sum(group_1 == "checked", na.rm = TRUE)
答案 0 :(得分:0)
考虑使用 R 内置 stats 库中的 aggregate 进行完全聚合:
# One row per GROUP x group_1 combination; `length` of num_col is the row count.
agg_df <- aggregate(
  num_col ~ GROUP + group_1,
  data = flow_group_df,
  FUN = length
)
或者使用 ave 进行内联聚合(结果保留在原数据框的每一行上):
# Attach to every row the number of rows sharing its GROUP x group_1 combination.
flow_group_df$group_1_count <- ave(
  flow_group_df$num_col,
  flow_group_df$GROUP,
  flow_group_df$group_1,
  FUN = length
)
使用随机数据进行演示:
# Fixed seed so the random demo data is reproducible.
set.seed(72318)
# 60 demo rows: three group labels repeated in turn, a random status per row,
# and a uniform random value in [0, 100].
flow_group_df <- data.frame(
  GROUP = rep(c("julia", "r", "pandas"), times = 20),
  group_1 = sample(c("flagged", "checked"), size = 60, replace = TRUE),
  num_col = runif(n = 60, min = 0, max = 100)
)
使用 aggregate 聚合
# Count the rows in every GROUP x group_1 cell; `length` of num_col is the count.
agg_df <- aggregate(
  num_col ~ GROUP + group_1,
  data = flow_group_df,
  FUN = length
)
# Sort by GROUP, then by status within each group.
agg_df <- agg_df[order(agg_df$GROUP, agg_df$group_1), ]
rownames(agg_df) <- NULL # drop the row labels left out of sequence by sorting
names(agg_df)[3] <- "count" # the third column holds the per-cell row count
agg_df
# GROUP group_1 count
# 1 julia checked 10
# 2 julia flagged 10
# 3 pandas checked 8
# 4 pandas flagged 12
# 5 r checked 7
# 6 r flagged 13
使用 ave(计数和百分比计算)
# Per-row count of rows sharing the same GROUP x group_1 combination.
flow_group_df$group_1_count <- ave(
  flow_group_df$num_col,
  flow_group_df$GROUP,
  flow_group_df$group_1,
  FUN = length
)
# Share of each status within its GROUP: the combination count divided by
# the number of rows in the whole GROUP.
flow_group_df$group_1_pct <- flow_group_df$group_1_count /
  ave(flow_group_df$num_col, flow_group_df$GROUP, FUN = length)
# Sort by GROUP, then by status, and renumber the rows.
flow_group_df <- flow_group_df[
  order(flow_group_df$GROUP, flow_group_df$group_1),
]
rownames(flow_group_df) <- NULL
tail(flow_group_df, n = 20)
# GROUP group_1 num_col group_1_count group_1_pct
# 41 r checked 8.128056 7 0.35
# 42 r checked 86.439911 7 0.35
# 43 r checked 75.488474 7 0.35
# 44 r checked 88.120510 7 0.35
# 45 r checked 43.058268 7 0.35
# 46 r checked 46.662674 7 0.35
# 47 r checked 42.329505 7 0.35
# 48 r flagged 94.959380 13 0.65
# 49 r flagged 64.817015 13 0.65
# 50 r flagged 61.118952 13 0.65
# 51 r flagged 69.104977 13 0.65
# 52 r flagged 98.078729 13 0.65
# 53 r flagged 74.857959 13 0.65
# 54 r flagged 83.813440 13 0.65
# 55 r flagged 99.069011 13 0.65
# 56 r flagged 62.298414 13 0.65
# 57 r flagged 14.335920 13 0.65
# 58 r flagged 70.404048 13 0.65
# 59 r flagged 18.744892 13 0.65
# 60 r flagged 21.598072 13 0.65