我有两个数据集:第一个 data_incom 不完整,第二个 data_to_com 包含第一个数据集中缺失的值。
如果使用 mutate(UG = case_when(INSEE == "07185" ~ 6, ...)),会覆盖整个 “UG” 列。
请问如何使用 tidyverse 工具,将第一个数据集中的 NA 替换为第二个表中对应的值?
谢谢!
# Incomplete data set: one row per record, keyed by the INSEE code (stored as
# a character string so the leading zero is kept). UG is an integer column in
# which NA marks the values that still have to be filled in.
data_incom <- data.frame(
  INSEE = c(
    "07005", "07005", "07010", "07011", "07011", "07012", "07019",
    "07025", "07026", "07032", "07033", "07042", "07064", "07066",
    "07068", "07069", "07075", "07088", "07096", "07099", "07101",
    "07101", "07105", "07105", "07107", "07110", "07117", "07117",
    "07119", "07128", "07129", "07131", "07144", "07153", "07154",
    "07159", "07161", "07161", "07168", "07172", "07173", "07185",
    "07186", "07202", "07204", "07228", "07232", "07240", "07261",
    "07265", "07273", "07279", "07284", "07286", "07294", "07301",
    "07315", "07329", "07330", "07331", "07338", "07338", "07347",
    "07187", "07265", "07334", "07262"
  ),
  UG = c(
    NA, NA, 2L, NA, NA, 10L, 13L,
    28L, 26L, 15L, 21L, 19L, 11L, 16L,
    8L, 6L, 26L, 25L, 11L, 18L, 21L,
    21L, 26L, 26L, 24L, 25L, 25L, 25L,
    NA, 3L, 8L, 22L, 24L, NA, 28L,
    NA, 28L, 28L, 21L, 1L, 12L, NA,
    15L, 24L, 7L, 1L, 24L, 9L, 9L,
    2L, 18L, 19L, NA, 11L, 21L, 6L,
    NA, 24L, 18L, 28L, 8L, 8L, 3L,
    24L, 2L, 20L, 24L
  ),
  # Explicit so INSEE stays character on R versions older than 4.0 as well.
  stringsAsFactors = FALSE
)
# Lookup table holding the UG value to use for INSEE codes whose UG is
# missing in data_incom.
# NOTE(review): "070005" has six digits while every other INSEE code here has
# five; it may be a typo for "07005" -- confirm before relying on it. As
# written it matches no row of data_incom.
data_to_com <- structure(
  list(
    INSEE = c(
      "07185", "07284", "07315", "07153", "07119", "07159", "070005"
    ),
    UG = c(6L, 20L, 24L, 28L, 26L, 15L, 17L)
  ),
  row.names = c(NA, 7L),
  class = "data.frame"
)
答案 0 :(得分:3)
您可以使用以下解决方案。第一个数据集中有一些 INSEE 值在第二个数据集中不存在,我只是将它们保留为 NA 值。
library(dplyr)
library(tidyr)
# Fill the missing UG values of data_incom from the lookup table data_to_com.
#
# match() pairs every INSEE code with the row of data_to_com holding exactly
# the same code (NA when there is none), and coalesce() keeps an existing UG,
# falling back to the looked-up value only where UG is NA. Compared with a
# grepl() lookup this never interprets a code as a regular expression or
# substring, keeps exactly one output row per input row, and cannot drop rows
# that already have a UG. Codes absent from data_to_com stay NA.
# as_tibble() keeps the result a tibble so it prints in the compact form.
data_com <- data_incom %>%
  as_tibble() %>%
  mutate(
    UG = coalesce(UG, data_to_com$UG[match(INSEE, data_to_com$INSEE)])
  )

# Show the completed table ordered by INSEE code.
data_com %>%
  arrange(INSEE)
# A tibble: 67 x 2
INSEE UG
<chr> <int>
1 07005 NA
2 07005 NA
3 07010 2
4 07011 NA
5 07011 NA
6 07012 10
7 07019 13
8 07025 28
9 07026 26
10 07032 15
# ... with 57 more rows
答案 1 :(得分:3)
这类场景可以使用 coalesce。
- left_join 会保留 data_incom 中的所有行;
- coalesce 在 UG.x 为 NA 时改用 UG.y 的值;
- .keep = 'unused' 只保留需要的列(INSEE 和合并后的 UG)。
library(dplyr)
# Join the lookup values onto every row of data_incom; the two UG columns come
# out of the join as UG.x (from data_incom) and UG.y (from data_to_com).
# coalesce() then takes UG.x where it is present and UG.y otherwise, and
# .keep = "unused" drops the two source columns so that only INSEE and the
# merged UG remain.
mutate(
  left_join(data_incom, data_to_com, by = "INSEE"),
  UG = coalesce(UG.x, UG.y),
  .keep = "unused"
)
INSEE UG
1 07005 NA
2 07005 NA
3 07010 2
4 07011 NA
5 07011 NA
6 07012 10
7 07019 13
8 07025 28
9 07026 26
10 07032 15
11 07033 21
12 07042 19
13 07064 11
14 07066 16
15 07068 8
16 07069 6
17 07075 26
18 07088 25
19 07096 11
20 07099 18
21 07101 21
22 07101 21
23 07105 26
24 07105 26
25 07107 24
26 07110 25
27 07117 25
28 07117 25
29 07119 26
30 07128 3
31 07129 8
32 07131 22
33 07144 24
34 07153 28
35 07154 28
36 07159 15
37 07161 28
38 07161 28
39 07168 21
40 07172 1
41 07173 12
42 07185 6
43 07186 15
44 07202 24
45 07204 7
46 07228 1
47 07232 24
48 07240 9
49 07261 9
50 07265 2
51 07273 18
52 07279 19
53 07284 20
54 07286 11
55 07294 21
56 07301 6
57 07315 24
58 07329 24
59 07330 18
60 07331 28
61 07338 8
62 07338 8
63 07347 3
64 07187 24
65 07265 2
66 07334 20
67 07262 24