我想通过从缺失数据所属组的非NA值中采样来填补缺失数据(NA)。
group =出口
此外,如果nrows(group)== 1,则从所有组中抽取随机值(例如,Outlet == ESO0244476)。
这是我到目前为止尝试过的:
样本数据
> dput(droplevels((example)))
structure(list(LENGTH = c(NA, 45447.4157838775, 161037.71538108,
78147.8550470324, 7193.48815617057, 1571.95459212405, 18191.381972185,
20366.2132412031, 10014.987524596, 1403.72511829297, 5651.17842991513,
6848.03271105711, 8043.32937011393, 8926.65133418451, 5808.44456603825,
2208.14264175252, 1797.4936747033, 5325.76651327694, 2660.66730207955,
5844.07912541444, 3956.40473896271, 959.873314407621, 3294.01472360025,
5221.94864001864, 3781.51913857335, 7811.83819953768, 3387.20323328623,
5514.92099458441, 5792.54371531706, 5643.98385143961, 15478.916809379,
8401.66533205217, 7046.25074819247, 2734.73639821402, NA, 62332.3343404513,
NA, 46563.1214718113, 25590.4020105238, 13015.3682275862, 4984.80432801441,
NA), NUMPOINTS = c(NA, 0, 8, 5, 2, 0, 9, 0, 0, 0, 3, 1, 0, 6,
1, 1, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, NA,
10, NA, 19, 6, 5, 0, NA), CTRY_ = structure(c(1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L,
3L, 2L, 2L, 2L, 2L, 1L), .Label = c("WCY_____ES", "WCY_____FR",
"WCY_____IT"), class = "factor"), Outlet = structure(c(4L, 3L,
3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("DSO0045543", "DSO0208527",
"DSO0213034", "ESO0244476"), class = "factor")), row.names = c(NA,
-42L), class = "data.frame")
示例代码
exam %>%
mutate(nuLENGTH = if_else(CTRY_ == 'WCY_____FR', LENGTH, sample(LENGTH[!is.na(LENGTH) & Outlet == Outlet], 1, TRUE)),
nuNUMPOINTS = if_else(CTRY_ == 'WCY_____FR', NUMPOINTS, sample(NUMPOINTS[!is.na(NUMPOINTS) & Outlet == Outlet], 1, TRUE)))
我希望在每个组中nuLENGTH
和nuNUMPOINTS
的值CTRY_ != 'WCY_____FR'
在组之间以及组内都不同,例如,不相同
LENGTH NUMPOINTS CTRY_ Outlet nuLENGTH nuNUMPOINTS
1 NA NA WCY_____ES ESO0244476 1000.0000 0
2 45447.4158 0 WCY_____FR DSO0213034 45447.4158 0
3 161037.7154 8 WCY_____FR DSO0213034 161037.7154 8
4 78147.8550 5 WCY_____FR DSO0213034 78147.8550 5
5 7193.4882 2 WCY_____FR DSO0045543 7193.4882 2
6 1571.9546 0 WCY_____FR DSO0045543 1571.9546 0
7 18191.3820 9 WCY_____FR DSO0045543 18191.3820 9
8 20366.2132 0 WCY_____FR DSO0045543 20366.2132 0
9 10014.9875 0 WCY_____FR DSO0045543 10014.9875 0
10 1403.7251 0 WCY_____FR DSO0045543 1403.7251 0
11 5651.1784 3 WCY_____FR DSO0045543 5651.1784 3
12 6848.0327 1 WCY_____FR DSO0045543 6848.0327 1
13 8043.3294 0 WCY_____FR DSO0045543 8043.3294 0
14 8926.6513 6 WCY_____FR DSO0045543 8926.6513 6
15 5808.4446 1 WCY_____FR DSO0045543 5808.4446 1
16 2208.1426 1 WCY_____FR DSO0045543 2208.1426 1
17 1797.4937 0 WCY_____FR DSO0045543 1797.4937 0
18 5325.7665 0 WCY_____FR DSO0045543 5325.7665 0
19 2660.6673 1 WCY_____FR DSO0045543 2660.6673 1
20 5844.0791 0 WCY_____FR DSO0045543 5844.0791 0
21 3956.4047 0 WCY_____FR DSO0045543 3956.4047 0
22 959.8733 0 WCY_____FR DSO0045543 959.8733 0
23 3294.0147 1 WCY_____FR DSO0045543 3294.0147 1
24 5221.9486 2 WCY_____FR DSO0045543 5221.9486 2
25 3781.5191 0 WCY_____FR DSO0045543 3781.5191 0
26 7811.8382 0 WCY_____FR DSO0045543 7811.8382 0
27 3387.2032 0 WCY_____FR DSO0045543 3387.2032 0
28 5514.9210 0 WCY_____FR DSO0045543 5514.9210 0
29 5792.5437 1 WCY_____FR DSO0045543 5792.5437 1
30 5643.9839 0 WCY_____FR DSO0045543 5643.9839 0
31 15478.9168 0 WCY_____FR DSO0045543 15478.9168 0
32 8401.6653 0 WCY_____FR DSO0045543 8401.6653 0
33 7046.2507 1 WCY_____FR DSO0045543 7046.2507 1
34 2734.7364 0 WCY_____FR DSO0045543 2734.7364 0
35 NA NA WCY_____IT DSO0045543 7046.2507 0
36 62332.3343 10 WCY_____FR DSO0045543 62332.3343 10
37 NA NA WCY_____IT DSO0045543 3000.0000 0
38 46563.1215 19 WCY_____FR DSO0208527 46563.1215 19
39 25590.4020 6 WCY_____FR DSO0208527 25590.4020 6
40 13015.3682 5 WCY_____FR DSO0208527 13015.3682 5
41 4984.8043 0 WCY_____FR DSO0208527 4984.8043 0
42 NA NA WCY_____ES DSO0208527 2000.0000 0
答案 0 :(得分:1)
这里有dplyr
但是,正如@Ronak所说,同一组中没有其他观察值可以在第一行中进行采样。
example %>%
group_by(Outlet) %>%
mutate(
nuLENGTH = case_when(
is.na(LENGTH) ~ sample(LENGTH,size=1),
!is.na(LENGTH) ~ LENGTH),
nuNUMPOINTS = case_when(
is.na(NUMPOINTS) ~ sample(NUMPOINTS,size=1),
!is.na(NUMPOINTS) ~ NUMPOINTS))
答案 1 :(得分:1)
如果NA
中有LENGTH
个值并且组中的行数大于1,我们可以编写一个函数,从除NA
值之外的组中选择随机值。如果组中的值数仅为1,则从整个数据中选择NA
个值之外的随机值。我们对列LENGTH
和NUMPOINTS
都应用该函数。
replace_func <- function(x, y) {
inds <- is.na(x)
if (length(x) > 1 & any(inds)) {
x[inds] <- sample(x[!inds], sum(inds))
x
}
else if(any(inds)) {
x[inds] <- sample(y[!is.na(y)], 1)
x
} else x
}
library(dplyr)
exam %>%
group_by(Outlet) %>%
mutate(nuLENGTH = replace_func(LENGTH, .$LENGTH),
nuNUMPOINTS = replace_func(NUMPOINTS, .$NUMPOINTS))
# LENGTH NUMPOINTS CTRY_ Outlet nuLENGTH nuNUMPOINTS
# <dbl> <dbl> <fct> <fct> <dbl> <dbl>
# 1 NA NA WCY_____ES ESO0244476 7046. 19
# 2 45447. 0 WCY_____FR DSO0213034 45447. 0
# 3 161038. 8 WCY_____FR DSO0213034 161038. 8
# 4 78148. 5 WCY_____FR DSO0213034 78148. 5
# 5 7193. 2 WCY_____FR DSO0045543 7193. 2
# 6 1572. 0 WCY_____FR DSO0045543 1572. 0
# 7 18191. 9 WCY_____FR DSO0045543 18191. 9
# 8 20366. 0 WCY_____FR DSO0045543 20366. 0
# 9 10015. 0 WCY_____FR DSO0045543 10015. 0
#10 1404. 0 WCY_____FR DSO0045543 1404. 0
# … with 32 more rows