如何使用`sapply()`按组插补某一类别中的缺失值?

时间:2018-05-28 12:38:43

标签: r sapply imputation

我希望将`cat`为1的所有`ctry`在`val`中的缺失值(NA),用各`ctry`自己的均值进行插补。

数据示例

# Reproducible toy panel: five countries (A-E) observed yearly, 2000-2005.
set.seed(654)
df1 <- data.frame(
  year = rep(2000:2005, each = 5),
  ctry = rep(LETTERS[1:5], times = 6),
  val = rnorm(30)
)
# Countries A and B form category 1; every other country is category 0.
df1$cat <- as.numeric(df1$ctry %in% c("A", "B"))
# Blank out 12 randomly chosen observations to simulate missing data.
df1$val[sample(nrow(df1), 12)] <- NA
> head(df1)
  year ctry         val cat
1 2000    A -0.76031762   1
2 2000    B -0.38970450   1
3 2000    C  1.68962523   0
4 2000    D          NA   0
5 2000    E  0.09530146   0
6 2001    A          NA   1

首先,我获取`cat`为1的`ctry`的名称,并计算它们各自的均值。

# Names of the category-1 countries that have at least one observed value.
# The column is selected by name ("ctry") rather than by position so the code
# keeps working if the column order of df1 changes.
cat1 <- as.character(sort(unique(
  df1[!is.na(df1$val) & df1$cat == 1, "ctry"]
)))
# Per-country mean of the observed values, returned as a numeric vector named
# by country. vapply() pins the result type to numeric even when `cat1` is
# empty, where sapply() would silently return a list.
cat1 <- vapply(
  cat1,
  function(x) mean(df1$val[df1$ctry == x], na.rm = TRUE),
  numeric(1)
)
> cat1
        A         B 
0.4372003 0.4792314 

现在我成功地对单个国家/地区手动进行了插补:

# Work on a copy so that df1 keeps its missing values.
df2 <- df1
# Fill the missing `val` entries of the first cat1 country with that
# country's mean; rows of all other countries are left as they are.
df2$val <- replace(
  df2$val,
  df2$ctry %in% names(cat1)[1] & is.na(df2$val),
  cat1[1]
)
> head(df2)
  year ctry         val cat
1 2000    A -0.76031762   1
2 2000    B -0.38970450   1
3 2000    C  1.68962523   0
4 2000    D          NA   0
5 2000    E  0.09530146   0
6 2001    A -0.49758245   1

但由于某种原因,我无法让`sapply()`自动完成这一插补:

# NOTE: this call does not change the global df2. The `<-` inside the
# anonymous function modifies a copy of df2 that is local to each call;
# sapply() then only returns the assigned values, which is the vector
# printed below.
> sapply(seq_along(cat1), 
+        function(x) df2$val[df2$ctry %in% names(cat1)[x] & is.na(df2$val)] <- cat1[x])
         A          B 
-0.4975825 -0.6139364 

预期输出是一个完整的数据框,其中cat1类别各国家的缺失值已用该国家自己的均值插补。

3 个答案:

答案 0 :(得分:1)

您可以使用tidyverse的`group_by`方法,计算每个`ctry`的平均值,然后使用`ifelse`替换`NA`。这里添加了一个新列`val2`来说明所发生的变化;您也可以把它写成`val`来直接覆盖原来的列。

library(tidyverse)
# Compute each country's mean of the observed values, then fill the gaps of
# category-1 countries only. The filled series is written to a new column
# `val2` so it can be compared with `val`; name it `val` to overwrite instead.
df1 %>%
  group_by(ctry) %>%
  mutate(
    # TRUE is spelled out because `T` is an ordinary, reassignable variable.
    Mean = mean(val, na.rm = TRUE),
    val2 = ifelse(is.na(val) & cat == 1, Mean, val)
  ) %>%
  ungroup()
# A tibble: 30 x 6
    year ctry       val   cat    Mean     val2
   <int> <fct>    <dbl> <dbl>   <dbl>    <dbl>
 1  2000 A      -0.760      1 -0.498   -0.760 
 2  2000 B      -0.390      1 -0.614   -0.390 
 3  2000 C       1.69       0  0.397    1.69  
 4  2000 D      NA          0 -0.0321  NA     
 5  2000 E       0.0953     0 -0.513    0.0953
 6  2001 A      NA          1 -0.498   -0.498 
 7  2001 B      NA          1 -0.614   -0.614 
 8  2001 C      NA          0  0.397   NA     
 9  2001 D      NA          0 -0.0321  NA     
10  2001 E      NA          0 -0.513   NA     
# ... with 20 more rows

答案 1 :(得分:1)

如果我的理解是正确的,你想要自动完成最后一个过程

# For every cat1 country, overwrite its missing `val` entries in the df2 of
# the enclosing environment with that country's mean. `<<-` is what makes the
# change visible outside the anonymous function; the value returned by
# sapply() is just the vector of means that were assigned.
sapply(seq_along(cat1), function(i) {
  to_fill <- df2$ctry %in% names(cat1)[i] & is.na(df2$val)
  df2$val[to_fill] <<- cat1[i]
})

> df2
   year ctry          val cat
1  2000    A -0.760317618   1
2  2000    B -0.389704501   1
3  2000    C  1.689625228   0
4  2000    D           NA   0
5  2000    E  0.095301460   0
6  2001    A -0.497582454   1
7  2001    B -0.613936417   1
8  2001    C           NA   0
9  2001    D           NA   0
10 2001    E           NA   0
11 2002    A -0.107260116   1
12 2002    B -0.838168333   1
13 2002    C -0.982605890   0
14 2002    D -0.820370986   0
15 2002    E -0.871432562   0
16 2003    A -0.497582454   1
17 2003    B -0.613936417   1
18 2003    C -0.006557849   0
19 2003    D  0.661696295   0
20 2003    E -0.762828067   0
21 2004    A -0.286692466   1
22 2004    B -0.613936417   1
23 2004    C  0.512579937   0
24 2004    D  0.722127317   0
25 2004    E           NA   0
26 2005    A -0.836059616   1
27 2005    B -0.613936417   1
28 2005    C  0.774016151   0
29 2005    D -0.691866605   0
30 2005    E           NA   0

我只是将`<-`替换为了`<<-`(作用域赋值,即超赋值运算符)。

答案 2 :(得分:1)

使用基础R(base R):

# Rebuild the example data so that this snippet runs on its own.
set.seed(654)
df1 <- data.frame(
  year = rep(2000:2005, each = 5),
  ctry = rep(LETTERS[1:5], 6),
  val = rnorm(30)
)
df1$cat <- ifelse(df1$ctry %in% c("A", "B"), 1, 0)
df1[sample(nrow(df1), 12), "val"] <- NA

# want:
# Mean of the observed (non-NA) values per country, named by country.
my_means <- tapply(df1$val, df1$ctry, mean, na.rm = TRUE)
# Only gaps of category-1 countries are imputed; missing values of all other
# countries stay NA.
to_fill <- is.na(df1$val) & df1$cat == 1
# Look the mean up by country *name*: as.character() makes sure a factor
# `ctry` is not used as integer codes when indexing `my_means`.
df1$val[to_fill] <- my_means[as.character(df1$ctry[to_fill])]