我希望获得一些帮助来确定:
我的df:
曲棍球运动员在整个赛季练习期间的数值负荷数据
有时,即使球员参加了训练,加速度计也可能出现故障而没有记录到数据。为了仍然能够跟踪他们在训练中的负荷,我想用同一次训练中相同位置(前锋、后卫或守门员)其他球员的平均值来填补缺失值。(例如,如果某位守门员的加速度计失灵,我想取该次训练中其他守门员的平均负荷,并将其填入该球员当次训练的观测行。)
> head(DummyLoads)
Name Date Load Load.Min Position
1 Jim 2019-10-19 900 2.100 Forward
2 Bob 2019-10-19 900 2.100 Forward
3 Dave 2019-10-19 900 2.100 Forward
4 Steve 2019-10-19 850 2.312 Forward
5 Fred 2019-10-19 850 2.312 Defense
6 Ray 2019-10-19 850 2.312 Defense
# Example training-load data: one row per player per practice date.
# Columns:
#   Name     - player name (factor, levels in alphabetical order)
#   Date     - practice date stored as a factor of "YYYY-MM-DD" strings
#   Load     - integer load value for the practice
#   Load.Min - numeric load-per-minute value
#   Position - factor with levels "Defense" and "Forward"
# Players who have no row for a given date are the "missing" observations.
DummyLoads <- data.frame(
  Name = factor(
    c(
      "Jim", "Bob", "Dave", "Steve", "Fred", "Ray",  # 2019-10-19
      "Jim", "Bob", "Dave", "Fred", "Ray",           # 2019-10-20
      "Jim", "Bob", "Dave", "Steve", "Fred", "Ray",  # 2019-10-21
      "Dave", "Steve", "Fred", "Ray"                 # 2019-10-22
    ),
    levels = c("Bob", "Dave", "Fred", "Jim", "Ray", "Steve")
  ),
  Date = factor(
    rep(
      c("2019-10-19", "2019-10-20", "2019-10-21", "2019-10-22"),
      times = c(6L, 5L, 6L, 4L)
    ),
    levels = c("2019-10-19", "2019-10-20", "2019-10-21", "2019-10-22")
  ),
  Load = rep(
    c(900L, 850L, 789L, 960L, 909L, 991L, 720L, 717L),
    times = c(3L, 3L, 3L, 2L, 3L, 3L, 1L, 3L)
  ),
  Load.Min = rep(
    c(2.1, 2.312, 2.22, 2, 1.88, 1.99, 2.1, 2.3),
    times = c(3L, 3L, 3L, 2L, 3L, 3L, 1L, 3L)
  ),
  Position = factor(
    c(
      "Forward", "Forward", "Forward", "Forward", "Defense", "Defense",
      "Forward", "Forward", "Forward", "Defense", "Defense",
      "Forward", "Forward", "Forward", "Forward", "Defense", "Defense",
      "Forward", "Forward", "Defense", "Defense"
    ),
    levels = c("Defense", "Forward")
  )
)
# Bar chart of Load per player, filled by Position, with one panel per
# practice Date. A player with no bar in a panel has no row for that date.
ggplot(data = DummyLoads, mapping = aes(x = Name, y = Load, fill = Position)) +
  geom_col() +
  facet_grid(cols = vars(Date))
以下是显示缺少数据的球员的图表。
理想情况下,我希望不必先绘图就能识别出这些缺失的数据点。我也希望避免每次都手动计算均值再手动录入。我想找到一个自动化的解决方案,因为整个赛季的训练数据都需要这样处理;不过我也明白,这可能比较棘手!
在此先感谢您的任何建议。如果我没有清楚说明问题,我深表歉意。
实际df:
structure(list(Athlete = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L, 14L,
14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 18L, 18L,
18L, 18L, 19L, 19L, 19L, 19L, 20L, 20L, 20L, 20L, 21L, 21L, 21L,
21L, 22L, 22L, 22L, 23L, 23L, 23L, 23L, 24L, 24L, 24L, 24L, 25L,
25L, 25L, 25L, 26L, 26L, 26L, 26L, 27L, 27L, 27L, 27L, 28L, 28L,
28L), .Label = c("Agosta", "Ambrose", "Bach", "Bettez", "Clark",
"Daoust", "Desbiens", "Eldridge", "Fast", "Fortino", "Gabel",
"Jenner", "Johnston", "Lacasse", "Lacquette", "Larocque", "Leslie",
"Maschmeyer", "Mikkelson", "Nurse", "Poulin", "Pozzebon", "Rattray",
"Rougeau", "Saulnier", "Stacey", "Tiley", "Turnbull"), class = "factor"),
Date = structure(c(18170, 18171, 18172, 18169, 18170, 18171,
18172, 18170, 18171, 18172, 18169, 18170, 18171, 18172, 18169,
18170, 18171, 18172, 18169, 18170, 18171, 18170, 18171, 18172,
18169, 18170, 18171, 18172, 18169, 18170, 18171, 18172, 18169,
18170, 18171, 18172, 18169, 18170, 18171, 18172, 18170, 18171,
18172, 18169, 18170, 18171, 18172, 18169, 18170, 18171, 18172,
18170, 18171, 18172, 18170, 18171, 18172, 18169, 18170, 18171,
18172, 18169, 18170, 18171, 18172, 18169, 18170, 18171, 18172,
18169, 18170, 18171, 18172, 18169, 18170, 18171, 18172, 18170,
18171, 18172, 18169, 18170, 18171, 18172, 18169, 18170, 18171,
18172, 18169, 18170, 18171, 18172, 18169, 18170, 18171, 18172,
18169, 18170, 18171, 18172, 18170, 18171, 18172), class = "Date"),
Position = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 2L, 2L, 2L), .Label = c("Defense", "Forward", "Goalie"
), class = "factor"), PL_Avg = c(178.5, 123, 174, 191, 163.5,
169, 155, 158.5, 158.5, 172, 214, 169, 220.5, 175, 235, 191.5,
217.5, 145, 217, 184.5, 181, 135, 68, 104, 190, 127.5, 129,
136, 210, 194, 208.5, 168, 220, 189.5, 213, 192, 180, 204,
167.5, 187, 178, 196.5, 151, 204, 200, 170.5, 165, 107, 116,
100.5, 88, 189.5, 131.5, 178, 176, 200.5, 157, 315, 174,
182.5, 168, 100, 130.5, 114.5, 75, 205, 154.5, 198, 175,
163, 160.5, 200, 149, 185, 154, 149.5, 182, 177, 192, 161,
287, 179.5, 166, 157, 188, 163.5, 171.5, 153, 220, 111.5,
197, 110, 214, 188.5, 184, 173, 100, 103, 99, 64, 173.5,
137, 143), PL_Min_Avg = c(2.41, 2.68, 2.46, 2.21, 2.205,
2.4, 2.19, 2.075, 2.185, 2.08, 2.46, 2.22, 2.615, 2.48, 2.7,
2.5, 2.57, 2.06, 2.5, 2.42, 2.505, 1.515, 1.43, 1.75, 1.67,
1.675, 1.785, 1.92, 2.42, 2.645, 2.485, 2.38, 2.54, 2.6,
2.5, 2.72, 2.08, 2.56, 2.315, 2.27, 2.33, 2.325, 2.13, 2.36,
2.28, 2.355, 2.33, 1.38, 1.3, 1.19, 1.47, 2.56, 2.33, 2.52,
2.385, 2.39, 2.23, 2.58, 2.28, 2.525, 2.38, 1.41, 1.465,
1.325, 1.82, 2.36, 2.58, 2.38, 2.48, 1.88, 2.115, 2.37, 2.11,
2.13, 2.02, 2.075, 1.8, 2.395, 2.29, 2.27, 2.35, 2.345, 2.29,
2.23, 2.16, 2.225, 2.02, 2.17, 2.53, 2.115, 2.72, 2.16, 2.47,
2.465, 2.555, 2.45, 1.29, 1.145, 1.145, 1.57, 1.97, 1.82,
2.03)), row.names = c(NA, -103L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(Athlete = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L,
10L, 10L, 10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 13L, 13L,
13L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 17L,
17L, 17L, 18L, 18L, 18L, 18L, 19L, 19L, 19L, 19L, 20L, 20L, 20L,
20L, 21L, 21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 23L, 24L,
24L, 24L, 24L, 25L, 25L, 25L, 25L, 26L, 26L, 26L, 26L, 27L, 27L,
27L, 27L, 28L, 28L, 28L), .Label = c("Agosta", "Ambrose", "Bach",
"Bettez", "Clark", "Daoust", "Desbiens", "Eldridge", "Fast",
"Fortino", "Gabel", "Jenner", "Johnston", "Lacasse", "Lacquette",
"Larocque", "Leslie", "Maschmeyer", "Mikkelson", "Nurse", "Poulin",
"Pozzebon", "Rattray", "Rougeau", "Saulnier", "Stacey", "Tiley",
"Turnbull"), class = "factor"), Date = structure(c(18170, 18171,
18172, 18169, 18170, 18171, 18172, 18170, 18171, 18172, 18169,
18170, 18171, 18172, 18169, 18170, 18171, 18172, 18169, 18170,
18171, 18170, 18171, 18172, 18169, 18170, 18171, 18172, 18169,
18170, 18171, 18172, 18169, 18170, 18171, 18172, 18169, 18170,
18171, 18172, 18170, 18171, 18172, 18169, 18170, 18171, 18172,
18169, 18170, 18171, 18172, 18170, 18171, 18172, 18170, 18171,
18172, 18169, 18170, 18171, 18172, 18169, 18170, 18171, 18172,
18169, 18170, 18171, 18172, 18169, 18170, 18171, 18172, 18169,
18170, 18171, 18172, 18170, 18171, 18172, 18169, 18170, 18171,
18172, 18169, 18170, 18171, 18172, 18169, 18170, 18171, 18172,
18169, 18170, 18171, 18172, 18169, 18170, 18171, 18172, 18170,
18171, 18172), class = "Date"), .rows = list(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,
30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L,
42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L,
54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L,
66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L,
78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L,
90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 101L,
102L, 103L)), row.names = c(NA, -103L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
答案 0 :(得分:1)
已更新至当前问题:
library(dplyr)

# Drop any existing grouping so the joins and the grouped mean below are not
# affected by groups already attached to the input.
DummyLoads <- DummyLoads %>%
  ungroup()

# Every Athlete x Date combination. Combinations that have no row in
# DummyLoads are the missing observations.
full_data <- expand.grid(
  Athlete = unique(pull(DummyLoads, Athlete)),
  Date = unique(pull(DummyLoads, Date))
)

# One position per athlete (the first one found). In the example data some
# players have different positions on different dates; if that were kept it
# would be unclear which positional average applies, so each athlete's
# position is assumed to be constant.
athlete_positions <- DummyLoads %>%
  select(Athlete, Position) %>%
  distinct(Athlete, .keep_all = TRUE)

full_data %>%
  # Join the incomplete data onto the full grid; missing observations get NA.
  left_join(DummyLoads, by = c("Athlete", "Date")) %>%
  # Attach the constant position to every row, including the missing ones.
  # Both joined tables carry Position, so the columns become Position.x
  # (as recorded, NA on missing rows) and Position.y (the constant one).
  left_join(athlete_positions, by = "Athlete") %>%
  # Position.x is kept so the recorded and assumed positions can be compared.
  rename(Position = Position.y) %>%
  group_by(Date, Position) %>%
  mutate(
    # Flag rows whose PL_Avg is missing so they can be identified
    # without plotting.
    Imputed = is.na(PL_Avg),
    # Where PL_Avg is missing, use the mean PL_Avg of the same position on
    # the same date; otherwise keep the measured value.
    Load2 = coalesce(PL_Avg, mean(PL_Avg, na.rm = TRUE))
  ) %>%
  # Return an ungrouped result so later verbs do not silently run per group.
  ungroup()
答案 1 :(得分:0)
通用Base R解决方案:
# Mark out NA rows flatten in single observation (each element denoting a vector of the df):
# Record, for every row, which columns were NA before imputation. Each element
# is a single string with one TRUE/FALSE flag per column, in column order
# (e.g. "FALSE, FALSE, TRUE, FALSE").
is_val_na <- unname(apply(is.na(DummyLoads), 1, paste, collapse = ", "))

# Replace the NAs in a numeric vector with the mean of its non-missing values.
# A vector with no non-missing values is returned unchanged (still NA).
impute_mean <- function(v) {
  missing <- is.na(v)
  if (!all(missing)) {
    v[missing] <- mean(v, na.rm = TRUE)
  }
  v
}

# Impute every numeric column with the mean of the same Position on the same
# Date. ave() applies the function per group and returns the values in the
# original row order, so the row-wise NA flags computed above still line up.
# (Splitting the data frame and calling is.numeric() on each piece never
# imputes anything, because a data frame is not numeric, and rbind()-ing the
# pieces back together reorders the rows.)
numeric_cols <- names(DummyLoads)[vapply(DummyLoads, is.numeric, logical(1))]
DummyLoads_imputed <- DummyLoads
DummyLoads_imputed[numeric_cols] <- lapply(
  DummyLoads[numeric_cols],
  function(column) {
    # Convert to double first so an imputed mean is not forced into an
    # integer column.
    ave(as.numeric(column), DummyLoads$Date, DummyLoads$Position,
        FUN = impute_mean)
  }
)

# Bind the data.frame with a factor vector holding the per-row NA flags.
DummyLoads_imputed <- cbind(DummyLoads_imputed, row_na = as.factor(is_val_na))
使用的数据:
# Example data used by this answer: one row per player per practice date.
# Columns:
#   Name     - player name (factor, levels in alphabetical order)
#   Date     - practice date stored as a factor of "YYYY-MM-DD" strings
#   Load     - integer load value for the practice
#   Position - factor with levels "Defense" and "Forward"
DummyLoads <- data.frame(
  Name = factor(
    c(
      "Jim", "Bob", "Dave", "Steve", "Fred", "Ray",  # 2019-10-19
      "Jim", "Bob", "Dave", "Fred", "Ray",           # 2019-10-20
      "Jim", "Bob", "Dave", "Steve", "Fred", "Ray",  # 2019-10-21
      "Dave", "Steve", "Fred", "Ray"                 # 2019-10-22
    ),
    levels = c("Bob", "Dave", "Fred", "Jim", "Ray", "Steve")
  ),
  Date = factor(
    rep(
      c("2019-10-19", "2019-10-20", "2019-10-21", "2019-10-22"),
      times = c(6L, 5L, 6L, 4L)
    ),
    levels = c("2019-10-19", "2019-10-20", "2019-10-21", "2019-10-22")
  ),
  Load = rep(
    c(900L, 850L, 789L, 960L, 909L, 991L, 720L, 717L),
    times = c(3L, 3L, 3L, 2L, 3L, 3L, 1L, 3L)
  ),
  Position = factor(
    c(
      "Forward", "Forward", "Forward", "Forward", "Forward", "Defense",
      "Forward", "Defense", "Defense", "Defense", "Defense",
      "Defense", "Defense", "Forward", "Forward", "Forward", "Forward",
      "Defense", "Forward", "Defense", "Defense"
    ),
    levels = c("Defense", "Forward")
  )
)