我有两个行和列长度不同的数据框
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(5, 8, 9),
France = c(4, NA, 7),
Luxembourg = c(10, 6, 3)
) -> df1
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(1, 9, 7, NA),
France = c(10, 3, 9, 6),
Luxembourg = c(10, NA, NA, 7),
Belgium = c(NA, 8, 1, 9)
) -> df2
我必须创建第三个 df (df3) 其中,
df3 应如下所示:
任何帮助将不胜感激
答案 0 :(得分:4)
我们可以加入 on
'Date' 并使用 fcoalesce
将 NA 替换为相应的非 NA
library(data.table)
nm2 <- intersect(names(df2)[-1], names(df1)[-1])
df3 <- copy(df2)
setDT(df3)[df1, (nm2) := Map(fcoalesce, mget(nm2),
mget(paste0('i.', nm2))), on = .(Date)]
-输出
df3
# Date Germany France Luxembourg Belgium
#1: 01/01/1990 1 10 10 NA
#2: 01/01/2000 9 3 10 8
#3: 01/01/2010 7 9 6 1
#4: 01/01/2020 9 6 7 9
或者这可以通过 tidyverse
library(dplyr)
library(stringr)
left_join(df2, df1, by = 'Date') %>%
mutate(Date, across(ends_with(".x"),
~ coalesce(., get(str_replace(cur_column(), "\\.x$", ".y"))))) %>%
select(-ends_with('.y')) %>%
rename_with(~ str_remove(., "\\.x$"), ends_with('.x'))
答案 1 :(得分:3)
这是另一个 data.table
选项
cols <- setdiff(intersect(names(df1), names(df2)), "Date")
setDT(df1)[setDT(df2),
on = "Date"
][
,
c(cols) :=
Map(
fcoalesce,
.SD[, cols, with = FALSE],
.SD[, paste0("i.", cols), with = FALSE]
)
][,
.SD,
.SDcols = patterns("^[^i]")
]
给予
Date Germany France Luxembourg Belgium
1: 01/01/1990 1 10 10 NA
2: 01/01/2000 5 4 10 8
3: 01/01/2010 8 9 6 1
4: 01/01/2020 9 7 3 9
答案 2 :(得分:1)
基础 R 解决方案:
# Store as a variable a list denoting each data.frame's column names:
# cnames => character vector
cnames <- list(names(df1), names(df2))
# Determine which vector of names is required in the resulting data.frame
# required_vecs => character vector
required_vecs <- cnames[[which.max(lengths(cnames))]]
# Merge the data: full_data => data.frame
full_data <- merge(
df1,
df2,
by = "Date",
all = TRUE
)
# Resolve the vector names of vectors requiring coalescing:
# clsce_required_vecs=> character vector
clsce_required_vecs <- setdiff(intersect(names(df1), names(df2)), c("Date"))
# Resolve the vector names of vectors not requiring coalescing:
# nt_rqrd_vecs => character vector
nt_rqrd_vecs <- setdiff(unlist(cnames), clsce_required_vecs)
# Split-Apply-Combine data requiring coalescing: coalesced_data => data.frame
coalesced_data <- setNames(
data.frame(
do.call(
cbind,
lapply(
clsce_required_vecs,
function(x) {
# Subset the data to only contain relevant vectors: sbst => data.frame
sbst <- full_data[,grepl(x, names(full_data))]
# Split each column (of the same data) into a data.frame in a list:
# same_vecs => list of data.frames
same_vecs <- split.default(sbst, seq_len(ncol(sbst)))
# Rename the data.frames as required and row-bind them into a single df:
# vector => GlobalEnv()
Reduce(
function(y, z){
replace(y, is.na(y), z[is.na(y)])
},
do.call(cbind, same_vecs)
)
}
)
), row.names = NULL),
clsce_required_vecs)
# Column bind and order the columns:
res <- cbind(full_data[, nt_rqrd_vecs], coalesced_data)[,required_vecs]
答案 3 :(得分:1)
library(tidyverse)
library(lubridate)
df1 <- tibble::tribble(
~Date, ~Germany, ~France, ~Luxembourg,
"01/01/2000", 5, 4, 10,
"01/01/2010", 8, NA, 6,
"01/01/2020", 9, 7, 3
)
df2 <- tibble::tribble(
~Date, ~Germany, ~France, ~Luxembourg, ~Belgium,
"01/01/1990", 1, 10, 10, NA,
"01/01/2000", 9, 3, NA, 8,
"01/01/2010", 7, 9, NA, 1,
"01/01/2020", NA, 6, 7, 9
)
bind_rows(df1 %>%
mutate(priority = 1),
df2 %>%
mutate(priority = 2)) %>%
mutate(Date = lubridate::dmy(Date)) %>%
group_by(Date) %>%
arrange(priority) %>%
summarise(across(-priority, ~ first(na.omit(.))))
#> # A tibble: 4 x 5
#> Date Germany France Luxembourg Belgium
#> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 1990-01-01 1 10 10 NA
#> 2 2000-01-01 5 4 10 8
#> 3 2010-01-01 8 9 6 1
#> 4 2020-01-01 9 7 3 9
答案 4 :(得分:1)
仅使用 dplyr
的 mutate(across..
方法。
我还建议使用 full_join
而不是 left_join
或 right_join
,因为 full_join
将采用 df1
或 df2
中的所有行,而不是向左或向右联接。
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(5, 8, 9),
France = c(4, NA, 7),
Luxembourg = c(10, 6, 3)
) -> df1
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(1, 9, 7, NA),
France = c(10, 3, 9, 6),
Luxembourg = c(10, NA, NA, 7),
Belgium = c(NA, 8, 1, 9)
) -> df2
library(dplyr)
df1 %>% full_join(df2, by = 'Date', suffix = c('_x', '_y')) %>%
mutate(across(ends_with('_x'), ~coalesce(., get(sub('_x', '_y', cur_column()))),
.names = '{sub("_x", "", {.col})}')) %>%
select(!ends_with('_x') & !ends_with('_y'))
#> Date Belgium Germany France Luxembourg
#> 1 01/01/2000 8 5 4 10
#> 2 01/01/2010 1 8 9 6
#> 3 01/01/2020 9 9 7 3
#> 4 01/01/1990 NA 1 10 10
由 reprex package (v2.0.0) 于 2021 年 5 月 18 日创建