Question

我有两个行和列长度不同的数据框

# Example input 1: three dates, three countries; France is missing for 2010.
df1 <- data.frame(
  Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
  Germany = c(5, 8, 9),
  France = c(4, NA, 7),
  Luxembourg = c(10, 6, 3),
  stringsAsFactors = FALSE
)

# Example input 2: one extra date (1990) and one extra country (Belgium),
# with NA gaps in Germany, Luxembourg and Belgium.
df2 <- data.frame(
  Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
  Germany = c(1, 9, 7, NA),
  France = c(10, 3, 9, 6),
  Luxembourg = c(10, NA, NA, 7),
  Belgium = c(NA, 8, 1, 9),
  stringsAsFactors = FALSE
)

我必须创建第三个 df (df3) 其中，

按 ID 和日期匹配后，df1 中的 NA 值用 df2 的对应值替换，反之亦然（df2 中的 NA 用 df1 的值替换）
当两个数据框在同一位置都有值时，以 df1 的值为准（df1 优先）
只出现在其中一个数据框中的列（如本例中的 Belgium）也应包含在 df3 中

df3 应如下所示：

任何帮助将不胜感激

Answer 1

我们可以按 'Date' 进行连接（join），并使用 fcoalesce 将 NA 替换为对应的非 NA 值

library(data.table)

# Shared value columns (everything except the leading Date key).
nm2 <- intersect(names(df2)[-1], names(df1)[-1])

# Start from df2 so its extra row (1990) and extra column (Belgium) are kept.
# as.data.table() returns a copy, so df2 itself is left untouched.
df3 <- as.data.table(df2)

# Update join on Date: inside `j`, plain names are df3 (i.e. df2) columns and
# the `i.`-prefixed names are the matching df1 columns. fcoalesce() returns the
# first non-NA value, so the df1 columns are passed first to give df1 priority;
# df2 values are only used where df1 is NA. Rows of df3 without a match in df1
# are not modified.
df3[df1, (nm2) := Map(fcoalesce, mget(paste0("i.", nm2)), mget(nm2)),
    on = .(Date)]

# Output

df3
#         Date Germany France Luxembourg Belgium
#1: 01/01/1990       1     10         10      NA
#2: 01/01/2000       5      4         10       8
#3: 01/01/2010       8      9          6       1
#4: 01/01/2020       9      7          3       9

或者也可以用 tidyverse 完成：

library(dplyr)
library(stringr)

# Left-join df1 onto df2: every df2 row (and the Belgium column) is kept, and
# only the Dates present in df2 appear in the result. Columns shared by both
# tables come back as <name>.x (from df2) and <name>.y (from df1).
left_join(df2, df1, by = "Date") %>%
  mutate(across(
    ends_with(".x"),
    # df1 has priority: take its value (.y) first and fall back to the df2
    # value (.x) only where df1 is NA.
    ~ coalesce(get(str_replace(cur_column(), "\\.x$", ".y")), .)
  )) %>%
  # Drop the df1 helper columns, then strip the ".x" suffix from the results.
  select(-ends_with(".y")) %>%
  rename_with(~ str_remove(., "\\.x$"), ends_with(".x"))

Answer 2

这是另一个 data.table 选项

library(data.table)

# Value columns present in both inputs (everything shared except the key).
cols <- setdiff(intersect(names(df1), names(df2)), "Date")
# After X[Y] the Y-side copies of those columns are prefixed with "i.".
i_cols <- paste0("i.", cols)

# as.data.table() copies its input, so df1 and df2 stay ordinary data frames
# (setDT() would convert the caller's objects by reference).
# df1[df2, on = "Date"] keeps every row of df2; the plain column names hold
# df1's values and the "i."-prefixed ones hold df2's values.
as.data.table(df1)[as.data.table(df2), on = "Date"][
  ,
  # df1 first, so df2 only fills the positions where df1 is NA.
  (cols) := Map(
    fcoalesce,
    .SD[, cols, with = FALSE],
    .SD[, i_cols, with = FALSE]
  )
][
  ,
  # Remove exactly the helper columns by name; a pattern such as "^[^i]" would
  # also discard any real column whose name happens to start with "i".
  (i_cols) := NULL
][]

结果为：

         Date Germany France Luxembourg Belgium
1: 01/01/1990       1     10         10      NA
2: 01/01/2000       5      4         10       8
3: 01/01/2010       8      9          6       1
4: 01/01/2020       9      7          3       9

Answer 3

基础 R 解决方案：

# Names of the columns present in both inputs (besides the join key). merge()
# returns each of them twice, suffixed ".x" (df1) and ".y" (df2):
# clsce_required_vecs => character vector
clsce_required_vecs <- setdiff(intersect(names(df1), names(df2)), "Date")

# Every column that must appear in the result, in first-seen order. Taking the
# union (instead of the names of the wider input) also keeps columns that
# exist only in the narrower data.frame: required_vecs => character vector
required_vecs <- union(names(df1), names(df2))

# Full outer join on Date so rows found in only one input are kept:
# full_data => data.frame
full_data <- merge(
  df1,
  df2,
  by = "Date",
  all = TRUE
)

# Coalesce each shared column: keep df1's value (".x") and use df2's value
# (".y") only where df1 is NA. The two source columns are addressed by their
# exact suffixed names, so a name that is a substring of another column name
# cannot pull in unrelated columns: res => data.frame
res <- full_data
for (vec_name in clsce_required_vecs) {
  from_df1 <- full_data[[paste0(vec_name, ".x")]]
  from_df2 <- full_data[[paste0(vec_name, ".y")]]
  missing_in_df1 <- is.na(from_df1)
  from_df1[missing_in_df1] <- from_df2[missing_in_df1]
  res[[vec_name]] <- from_df1
}

# Keep only the final columns, in order. drop = FALSE keeps a data.frame even
# if a single column is selected:
res <- res[, required_vecs, drop = FALSE]

Answer 4

library(tidyverse)
library(lubridate)

# Example inputs, built column by column.
df1 <- tibble(
  Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
  Germany = c(5, 8, 9),
  France = c(4, NA, 7),
  Luxembourg = c(10, 6, 3)
)
df2 <- tibble(
  Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
  Germany = c(1, 9, 7, NA),
  France = c(10, 3, 9, 6),
  Luxembourg = c(10, NA, NA, 7),
  Belgium = c(NA, 8, 1, 9)
)

# Stack both tables, tagging each row with the rank of its source
# (1 = df1, 2 = df2). Rows are sorted by that rank, so within each Date the
# df1 row comes first and the first non-NA value per column is df1's
# whenever df1 has one.
bind_rows(
  mutate(df1, priority = 1),
  mutate(df2, priority = 2)
) %>%
  mutate(Date = dmy(Date)) %>%
  arrange(priority) %>%
  group_by(Date) %>%
  summarise(across(-priority, function(v) first(na.omit(v))))
#> # A tibble: 4 x 5
#>   Date       Germany France Luxembourg Belgium
#>   <date>       <dbl>  <dbl>      <dbl>   <dbl>
#> 1 1990-01-01       1     10         10      NA
#> 2 2000-01-01       5      4         10       8
#> 3 2010-01-01       8      9          6       1
#> 4 2020-01-01       9      7          3       9

Answer 5

仅使用 dplyr 的 mutate(across(...)) 的方法。

我还建议使用 full_join 而不是 left_join 或 right_join，因为 full_join 会保留 df1 和 df2 中的所有行，而 left_join / right_join 只保留其中一侧数据框的全部行。

library(dplyr)

# Example inputs: df1 takes priority; df2 adds the 1990 row and the Belgium
# column and fills df1's gaps.
df1 <- data.frame(
  stringsAsFactors = FALSE,
  Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
  Germany = c(5, 8, 9),
  France = c(4, NA, 7),
  Luxembourg = c(10, 6, 3)
)
df2 <- data.frame(
  stringsAsFactors = FALSE,
  Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
  Germany = c(1, 9, 7, NA),
  France = c(10, 3, 9, 6),
  Luxembourg = c(10, NA, NA, 7),
  Belgium = c(NA, 8, 1, 9)
)

# full_join keeps every Date found in either table. Columns shared by both
# tables are suffixed "_x" (df1) and "_y" (df2).
df1 %>%
  full_join(df2, by = "Date", suffix = c("_x", "_y")) %>%
  mutate(across(
    ends_with("_x"),
    # Take df1's value and fall back to the matching "_y" column when it is NA.
    # The pattern is anchored with `$` so only the trailing suffix is swapped,
    # even if "_x" also occurs earlier in the column name.
    ~ coalesce(., get(sub("_x$", "_y", cur_column()))),
    # Name each result after the source column with the trailing "_x" removed.
    .names = '{sub("_x$", "", .col)}'
  )) %>%
  # Discard the suffixed helper columns, keeping Date, Belgium and the results.
  select(!ends_with("_x") & !ends_with("_y"))

#>         Date Belgium Germany France Luxembourg
#> 1 01/01/2000       8       5      4         10
#> 2 01/01/2010       1       8      9          6
#> 3 01/01/2020       9       9      7          3
#> 4 01/01/1990      NA       1     10         10

由 reprex 包 (v2.0.0) 于 2021 年 5 月 18 日创建

用 R 中匹配的 ID 和日期替换两个数据框的 NA 值

5 个答案: