模拟数据:
set.seed(1)
df1 <- data.frame(country=c("US", "UK"),
year=c(2000, 2003))
df2 <- data.frame(country=rep(c("US", "UK"), 10),
year=rep(2000:2009, 2),
myvar=rnorm(20))
df1
包含感兴趣的国家/地区年份。我希望获得此国家/地区的myvar
值以及之前和之后的3年。
换句话说,合并是基于df2$country==df1$country
和df2$year > df1$year - 3 & df2$year < df1$year + 3
编辑:我的(工作,不优雅)解决方案是填充df1
以创建我感兴趣的所有国家/地区年份,然后以常规方式与df2
合并。
library(plyr)
ddply(df1, c("country", "year"),
function(df) data.frame(rep(df$country, 7), (df$year-3):(df$year+3)))
产生
country year rep.df.country..7. X.df.year...3...df.year...3.
1 UK 2003 UK 2000
2 UK 2003 UK 2001
3 UK 2003 UK 2002
4 UK 2003 UK 2003
5 UK 2003 UK 2004
6 UK 2003 UK 2005
7 UK 2003 UK 2006
8 US 2000 US 1997
9 US 2000 US 1998
10 US 2000 US 1999
11 US 2000 US 2000
12 US 2000 US 2001
13 US 2000 US 2002
14 US 2000 US 2003
答案 0 :(得分:1)
合并在哪里?这听起来像是一个子集问题,除非我误解了这个问题(正如我常常所做的那样)
set.seed(1)
df1 <- data.frame(country=c("US", "UK"),
year=c(2000, 2003))
df2 <- data.frame(country=rep(c("US", "UK"), 10),
year=rep(2000:2009, 2),
myvar=rnorm(20))
f <- lapply(df1$country, function(x) {
tmp <- df2[df2$country == x, ]
tmp[abs(tmp$year - df1[df1$country == x, 'year']) <= 3, ]
})
do.call(rbind, f)
# country year myvar
# 1 US 2000 -0.62645381
# 3 US 2002 -0.83562861
# 11 US 2000 1.51178117
# 13 US 2002 -0.62124058
# 2 UK 2001 0.18364332
# 4 UK 2003 1.59528080
# 6 UK 2005 -0.82046838
# 12 UK 2001 0.38984324
# 14 UK 2003 -2.21469989
# 16 UK 2005 -0.04493361
修改
set.seed(1)
df1 <- data.frame(country=c("US", "UK"),
year=c(2000, 2003, 2009, 2009))
df2 <- data.frame(country=rep(c("US", "UK"), 10),
year=rep(2000:2009, 2),
myvar=rnorm(20))
f <- lapply(seq_len(nrow(df1)), function(x) {
y <- df1[x, 'country']
tmp <- df2[df2$country == y, ]
tmp[abs(tmp$year - df1[x, 'year']) <= 3, ]
})
do.call(rbind, f)
# country year myvar
# 1 US 2000 -0.62645381
# 3 US 2002 -0.83562861
# 11 US 2000 1.51178117
# 13 US 2002 -0.62124058
# 2 UK 2001 0.18364332
# 4 UK 2003 1.59528080
# 6 UK 2005 -0.82046838
# 12 UK 2001 0.38984324
# 14 UK 2003 -2.21469989
# 16 UK 2005 -0.04493361
# 7 US 2006 0.48742905
# 9 US 2008 0.57578135
# 17 US 2006 -0.01619026
# 19 US 2008 0.82122120
# 8 UK 2007 0.73832471
# 10 UK 2009 -0.30538839
# 18 UK 2007 0.94383621
# 20 UK 2009 0.59390132
答案 1 :(得分:1)
在data.table中使用foverlaps的试用
set.seed(1)
df1 <- data.frame(country=c("US", "UK"),
year=c(2000, 2003, 2009, 2009))
df2 <- data.frame(country=rep(c("US", "UK"), 10),
year=rep(2000:2009, 2),
myvar=rnorm(20))
library(data.table)
setDT(df1); setDT(df2) # convert to data table
df1[, c("start", "end") := list(year-2, year+2)]
setkey(df1, country, start, end)
setkey(df2[, year2:=year], country, year, year2)
foverlaps(df1, df2, type="any")[,4:7:=NULL][]
country year myvar
1: UK 2001 0.18364332
2: UK 2001 0.38984324
3: UK 2003 1.59528080
4: UK 2003 -2.21469989
5: UK 2005 -0.82046838
6: UK 2005 -0.04493361
7: UK 2007 0.73832471
8: UK 2007 0.94383621
9: UK 2009 -0.30538839
10: UK 2009 0.59390132
11: US 2000 -0.62645381
12: US 2000 1.51178117
13: US 2002 -0.83562861
14: US 2002 -0.62124058
15: US 2008 0.57578135
16: US 2008 0.82122120
答案 2 :(得分:0)
使用data.table
library(data.table) # v1.9.7 (devel version)
# go here for install instructions
# https://github.com/Rdatatable/data.table/wiki/Installation
# convert datasets into data.table
setDT(df1)
setDT(df2)
# create conditional columns in df1
df1[, yearplus3 := year +3 ][, yearminus3 := year - 3 ]
# merge
output <- df1[df2, on = .(country = country , # condition 1
yearminus3 < year, # condition 2
yearplus3 > year), nomatch = 0 , # condition 3
.(country, year, myvar )] # indicate columns in the output
output
> country year myvar
>1: US 2000 -0.62645381
>2: UK 2003 0.18364332
>3: US 2000 -0.83562861
>4: UK 2003 1.59528080
>5: UK 2003 -0.82046838
>6: US 2000 1.51178117
>7: UK 2003 0.38984324
>8: US 2000 -0.62124058
PS。请注意,截至今天(2016年5月12日),参数on =
仍处于data.table
的开发版本中