我有一个名为 `df1` 的数据框,它有四列(即 `id`、`s`、`date` 和 `value`)。其中 `value` 列为空,我想用第二个名为 `df2` 的数据框来填充它。`df2` 包含一个 `id` 列和许多其他列,这些列均以其对应的日期命名。我需要做的是:在 `df2` 中找到日期和 ID 都匹配的相应值,并将其填入 `df1$value`。
示例数据:
# Reproducible example data: seed the RNG before any random draw.
set.seed(123)

# df1: 100 ids, each with a uniform score `s` in [100, 1000], one day sampled
# (without replacement) from 1999-01-01..2001-01-01, and an all-NA `value`
# column that is meant to be filled from df2.
df1 <- data.frame(id = seq_len(100))
df1$s <- runif(nrow(df1), min = 100, max = 1000)
df1$date <- sample(
  seq(as.Date("1999/01/01"), as.Date("2001/01/01"), by = "day"),
  size = nrow(df1)
)
df1$value <- NA

# df2: a 100 x 800 block of uniform draws in [1, 100] with its first column
# dropped; the remaining 799 columns are named after consecutive days starting
# at 1999-01-01, and an `id` column is put in front.
df2 <- data.frame(matrix(runif(80000, 1, 100), nrow = 100, ncol = 800))[, -1]
names(df2) <- as.character(
  seq(as.Date("1999-01-01"), by = "day", length.out = 799)
)
df2 <- data.frame(id = seq_len(100), df2, check.names = FALSE)
答案 0 :(得分:7)
一种方法是使用 tidyr 的整形函数(如 `gather`)将 `df2` 转换为长格式,然后执行 `left_join`:
library(dplyr)
library(tidyr)

# Look up, for every row of df1, the df2 cell whose id and date both match.
#
# df2 is wide (one column per day, named "YYYY-MM-DD"), so it is first reshaped
# to long form (id, date, value) with pivot_longer() -- the current replacement
# for the superseded gather() -- and the column names are parsed into Date so
# they are comparable with df1$date. The left join keeps every row of df1.
#
# Note: df1 must not already carry a `value` column; otherwise the join returns
# `value.x` / `value.y` instead of a single `value`. Drop the empty placeholder
# column first in that case.
df1 %>%
  left_join(
    df2 %>%
      pivot_longer(-id, names_to = "date", values_to = "value") %>%
      mutate(date = as.Date(date)),
    by = c("id", "date")
  )
# id s date value
#1 1 359 2000-03-15 48.32
#2 2 809 1999-09-01 62.16
#3 3 468 1999-12-23 16.41
#4 4 895 2000-11-26 32.70
#5 5 946 1999-12-18 5.84
#6 6 141 2000-10-09 74.65
#7 7 575 2000-10-25 9.22
#8 8 903 2000-03-17 6.46
#9 9 596 1999-10-25 73.48
#10 10 511 1999-04-17 62.43
#...
数据
# Example data for this answer; seed the RNG so the draws are reproducible.
set.seed(123)

# df1: ids 1..100 with a uniform score `s` in [100, 1000] and one day sampled
# from 1999-01-01..2001-01-01. It deliberately has no `value` column, so the
# joined `value` is the only one in the result.
df1 <- data.frame(id = seq_len(100))
df1$s <- runif(nrow(df1), min = 100, max = 1000)
df1$date <- sample(
  seq(as.Date("1999/01/01"), as.Date("2001/01/01"), by = "day"),
  size = nrow(df1)
)

# df2: 799 columns of uniform draws in [1, 100] (an 800-column block minus its
# first column), named after consecutive days from 1999-01-01, plus a leading
# `id` column.
df2 <- data.frame(matrix(runif(80000, 1, 100), nrow = 100, ncol = 800))[, -1]
names(df2) <- as.character(
  seq(as.Date("1999-01-01"), by = "day", length.out = 799)
)
df2 <- data.frame(id = seq_len(100), df2, check.names = FALSE)
答案 1 :(得分:4)
您还可以使用melt,然后使用两个键进行左联接:
library(dplyr)
library(reshape2)

# Seed the RNG so the example data are reproducible.
set.seed(123)

# df1: ids 1..100, a uniform score `s` in [100, 1000], one sampled day per id,
# and an all-NA `value` placeholder column.
df1 <- data.frame(id = seq_len(100))
df1$s <- runif(nrow(df1), min = 100, max = 1000)
df1$date <- sample(
  seq(as.Date("1999/01/01"), as.Date("2001/01/01"), by = "day"),
  size = nrow(df1)
)
df1$value <- NA

# df2 (wide): 799 day-named columns of uniform draws in [1, 100] behind an
# `id` column.
df2 <- data.frame(matrix(runif(80000, 1, 100), nrow = 100, ncol = 800))[, -1]
names(df2) <- as.character(
  seq(as.Date("1999-01-01"), by = "day", length.out = 799)
)
df2 <- data.frame(id = seq_len(100), df2, check.names = FALSE)

# Reshape df2 to long form (id, date, Value); the former column names come back
# as a factor, so parse them into Date to make them joinable with df1$date.
df2 <- melt(
  df2,
  id.vars = "id",
  variable.name = "date",
  value.name = "Value"
) %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d"))

# Left join on both keys: every df1 row is kept and gains the matching `Value`
# (capital V), next to the untouched all-NA `value` column.
df1 <- df1 %>%
  left_join(df2, by = c("id", "date"))
head(df1)
id s date value Value
1 1 358.8198 2000-03-15 NA 48.31799
2 2 809.4746 1999-09-01 NA 62.15760
3 3 468.0792 1999-12-23 NA 16.41291
4 4 894.7157 2000-11-26 NA 32.70024
5 5 946.4206 1999-12-18 NA 5.83607
6 6 141.0008 2000-10-09 NA 74.64832
答案 2 :(得分:3)
我们可以使用 `data.table` 的连接,这是一种高效的做法,在大数据集上应该会很快:
library(data.table)

# Fill df1$value from df2 with a data.table update join.
#
# The plain form df1[long_df2, on = ...] returns one row for every row of the
# melted df2 rather than the rows of df1, so `value := i.value` is used
# instead: it writes the matching df2 value into df1 by reference and leaves
# df1's rows and order unchanged. Rows of df1 without a match get NA.
setDT(df1)

# Remove an existing (empty) `value` placeholder so the looked-up numbers are
# stored in a freshly created column rather than sub-assigned into it.
if ("value" %in% names(df1)) {
  df1[, value := NULL]
}

df1[
  # Long form of df2: (id, date, value). The day-named columns come back as
  # character (variable.factor = FALSE) and are parsed to Date so the key has
  # the same class as df1$date. setDT() converts df2 by reference, as before.
  melt(
    setDT(df2),
    id.vars = "id",
    variable.name = "date",
    value.name = "value",
    variable.factor = FALSE
  )[, date := as.Date(date, format = "%Y-%m-%d")],
  on = .(id, date),
  value := i.value
]

# `:=` returns invisibly; print the updated table.
df1[]