tidyverse:按条件用另一个数据框中的值替换 NA

时间:2021-06-11 15:27:15

标签: r tidyverse

我有一个不完整的数据集 data_incom,以及第二个数据集 data_to_com,后者包含前者缺失的值。如果使用 mutate(UG = case_when(INSEE == "07185" ~ 6, ...)) 之类的写法,会覆盖整个“UG”列。请问如何使用 tidyverse 工具,用第二个表中的值替换第一个数据集中的 NA?

谢谢!

# Incomplete dataset: 67 rows of INSEE codes (character) with their UG value
# (integer); UG is NA on the rows that still have to be filled in.
data_incom <- data.frame(
  INSEE = c(
    "07005", "07005", "07010", "07011", "07011", "07012", "07019", "07025",
    "07026", "07032", "07033", "07042", "07064", "07066", "07068", "07069",
    "07075", "07088", "07096", "07099", "07101", "07101", "07105", "07105",
    "07107", "07110", "07117", "07117", "07119", "07128", "07129", "07131",
    "07144", "07153", "07154", "07159", "07161", "07161", "07168", "07172",
    "07173", "07185", "07186", "07202", "07204", "07228", "07232", "07240",
    "07261", "07265", "07273", "07279", "07284", "07286", "07294", "07301",
    "07315", "07329", "07330", "07331", "07338", "07338", "07347", "07187",
    "07265", "07334", "07262"
  ),
  UG = c(
    NA, NA, 2L, NA, NA, 10L, 13L, 28L, 26L, 15L, 21L, 19L, 11L, 16L, 8L, 6L,
    26L, 25L, 11L, 18L, 21L, 21L, 26L, 26L, 24L, 25L, 25L, 25L, NA, 3L, 8L,
    22L, 24L, NA, 28L, NA, 28L, 28L, 21L, 1L, 12L, NA, 15L, 24L, 7L, 1L, 24L,
    9L, 9L, 2L, 18L, 19L, NA, 11L, 21L, 6L, NA, 24L, 18L, 28L, 8L, 8L, 3L,
    24L, 2L, 20L, 24L
  ),
  # Keep INSEE as character on every R version (the default changed in R 4.0).
  stringsAsFactors = FALSE
)


# Lookup table holding the UG values to fill in, keyed by INSEE.
# NOTE(review): "070005" has six characters while every other code has five,
# and it matches no INSEE in data_incom -- confirm whether "07005" was meant.
data_to_com <- structure(
  list(
    INSEE = c(
      "07185", "07284", "07315", "07153", "07119", "07159", "070005"
    ),
    UG = c(6L, 20L, 24L, 28L, 26L, 15L, 17L)
  ),
  row.names = c(NA, 7L),
  class = "data.frame"
)

2 个答案:

答案 0 :(得分:3)

您可以使用以下解决方案。第一个数据集中有一些 INSEE 值在第二个数据集中不存在,我只是将它们保留为 NA 值。

library(dplyr)
library(tidyr)

# Step 1: take only the rows whose UG is missing and look each INSEE up in
# data_to_com. The lookup is an exact match (%in%); a regex match such as
# grepl() would also accept codes that merely contain the pattern.
# keep_empty = TRUE keeps the rows that have no match, with UG left as NA.
data_com <- data_incom %>%
  filter(is.na(UG)) %>%
  rowwise() %>%
  mutate(UG = list(data_to_com$UG[data_to_com$INSEE %in% INSEE])) %>%
  ungroup() %>%
  unnest(cols = c(UG), keep_empty = TRUE)

# Step 2: add back the rows that already had a UG value and sort by INSEE.
# Selecting on !is.na(UG) (rather than on INSEE) keeps an already-filled row
# even when another row with the same INSEE was filled in step 1.
data_com %>%
  bind_rows(data_incom %>%
              filter(!is.na(UG))) %>%
  arrange(INSEE)


# A tibble: 67 x 2
   INSEE    UG
   <chr> <int>
 1 07005    NA
 2 07005    NA
 3 07010     2
 4 07011    NA
 5 07011    NA
 6 07012    10
 7 07019    13
 8 07025    28
 9 07026    26
10 07032    15
# ... with 57 more rows

答案 1 :(得分:3)

在这些场景中使用 coalesce

  • 使用 left_join 会保留 data_incom 中的所有行
  • 之后使用 coalesce,在 UG.x 为 NA 时取 UG.y 的值
  • 再在 mutate 中使用参数 .keep = 'unused',只保留需要的列(去掉已用过的 UG.x 和 UG.y)
library(dplyr)

# Join the lookup values onto every row of data_incom; the two UG columns are
# referenced below as UG.x (from data_incom) and UG.y (from data_to_com).
# coalesce() keeps UG.x and falls back to UG.y where UG.x is NA, and
# .keep = "unused" drops the two helper columns afterwards.
data_incom %>%
  left_join(data_to_com, by = "INSEE") %>%
  mutate(UG = coalesce(UG.x, UG.y), .keep = "unused")

   INSEE UG
1  07005 NA
2  07005 NA
3  07010  2
4  07011 NA
5  07011 NA
6  07012 10
7  07019 13
8  07025 28
9  07026 26
10 07032 15
11 07033 21
12 07042 19
13 07064 11
14 07066 16
15 07068  8
16 07069  6
17 07075 26
18 07088 25
19 07096 11
20 07099 18
21 07101 21
22 07101 21
23 07105 26
24 07105 26
25 07107 24
26 07110 25
27 07117 25
28 07117 25
29 07119 26
30 07128  3
31 07129  8
32 07131 22
33 07144 24
34 07153 28
35 07154 28
36 07159 15
37 07161 28
38 07161 28
39 07168 21
40 07172  1
41 07173 12
42 07185  6
43 07186 15
44 07202 24
45 07204  7
46 07228  1
47 07232 24
48 07240  9
49 07261  9
50 07265  2
51 07273 18
52 07279 19
53 07284 20
54 07286 11
55 07294 21
56 07301  6
57 07315 24
58 07329 24
59 07330 18
60 07331 28
61 07338  8
62 07338  8
63 07347  3
64 07187 24
65 07265  2
66 07334 20
67 07262 24