我有一个包含时间序列观测值的数据框。我想为每条观测添加两个变量,分别取上一年和下一年中日期最接近的那条观测的值(例如,对于2014年5月15日,对应的值可能来自2013年5月13日和2015年5月21日)。有没有比较巧妙的方法(例如使用dplyr)来实现?示例代码见下文(大部分代码只是用来生成一组随机的日期和值,感谢之前的一个SO问题)。提前致谢。
rm -rf schema/label
答案 0 :(得分:1)
这绝对算不上聪明的做法,但希望有人(也许就是你)能把它改得更巧妙、更漂亮。
library(dplyr)
library(lubridate)
# Build the working table: one row per observation, its calendar year, and two
# placeholder columns that will receive the matched values.
# NOTE(review): `dateval` and `value` are not defined in this excerpt; they
# presumably come from the question's sample data -- confirm.
dat <- data.frame(dateval, value)
dat <- dat %>% mutate(year = year(dateval), nv_next = NA, nv_prev = NA)

# Year offsets to search: +1 fills nv_next, -1 fills nv_prev.
shifts <- c(1, -1)
for (s in seq_along(shifts)) {
  # seq_len() keeps the loop from running when `dat` has no rows.
  for (i in seq_len(nrow(dat))) {
    # Observations that belong to the neighbouring year for this offset.
    otheryear <- dat[dat$year == dat$year[i] + shifts[s], ]
    if (nrow(otheryear) == 0) {
      next # no neighbouring year: keep the NA placeholder
    }

    # Move a copy of the candidate dates into the current row's year so that
    # only month and day contribute to the distance.
    cands_shifted <- otheryear$dateval
    year(cands_shifted) <- dat$year[i]
    gaps <- abs(difftime(dat$dateval[i], cands_shifted, units = "days"))

    # which.min() skips NA distances and returns integer(0) if none is usable.
    nearest <- which.min(gaps)
    if (length(nearest) == 0) {
      next
    }

    # Take the value straight from the matched candidate row; looking it up
    # again by date in `dat` returns several values when a date is duplicated.
    # Columns 4 and 5 are nv_next and nv_prev, in the same order as `shifts`.
    dat[i, 3 + s] <- otheryear$value[nearest]
  }
}
这导致了
> dat
dateval value year nv_next nv_prev
1 2009-02-14 6.511781 2009 3.010648 5.782136
2 2009-12-23 5.389843 2009 4.943871 5.918977
3 2011-08-01 4.378759 2011 5.074565 4.943871
4 2014-04-07 2.785300 2014 NA 5.417942
5 2008-08-12 6.124931 2008 5.389843 5.593901
6 2014-03-10 4.955066 2014 NA 5.619826
7 2014-07-23 4.983810 2014 NA 5.417942
8 2012-04-14 5.943836 2012 5.417942 4.378759
9 2012-01-13 5.821221 2012 5.619826 4.378759
10 2007-06-30 5.593901 2007 5.782136 NA
11 2008-08-24 5.918977 2008 5.389843 5.593901
12 2008-05-30 5.782136 2008 6.511781 5.593901
13 2012-06-30 5.074565 2012 5.417942 4.378759
14 2010-01-27 3.010648 2010 4.378759 6.511781
15 2013-02-27 5.619826 2013 4.955066 5.821221
16 2010-12-25 4.943871 2010 4.378759 5.389843
17 2012-09-27 4.844204 2012 5.417942 4.378759
18 2014-12-08 3.529248 2014 NA 5.417942
19 2010-01-15 4.521850 2010 4.378759 6.511781
20 2013-03-21 5.417942 2013 4.955066 5.943836
我把两个for循环嵌套在一起,而不是为每个偏移量各写一份代码,但要注意nv_next和nv_prev这两列:它们是按列索引而不是按列名选取的。
答案 1 :(得分:0)
Molx的答案比我的更短、更好,但既然我已经写出来了,就一并贴上,以防你需要一个 a)把逻辑拆成可复用函数、b)只用基础R编写的答案。
加载数据:
# Read the sample data. Strings are kept as character (not factor) because the
# helper functions below split the date column with strsplit(), which only
# accepts character input; this matters on R versions older than 4.0.
dates <- read.table("date_data.txt", stringsAsFactors = FALSE)
下面这个函数只根据月份和日期计算两个日期相差的天数;这样才能比较不同年份的日期在日历上的接近程度。
# Signed difference in days between two dates, using month and day only.
#
# Both inputs are moved into the same reference year before subtracting, so
# the year part of each date has no influence on the result.
#
# Args:
#   date1, date2: dates as character strings (or Date objects); vectors are
#     accepted and recycled by the usual R rules.
#   date_format: format used to parse character input.
#
# Returns:
#   Numeric number of days, month/day of date1 minus month/day of date2.
#   No wrap-around is applied: 31 Dec versus 1 Jan is 365, not 1.
compare_dates_days <- function(date1, date2, date_format = "%Y-%m-%d") {
  # 2000 is a leap year, so 29 February remains a valid date after the move.
  to_reference_year <- function(d) {
    parsed <- as.Date(as.character(d), format = date_format)
    as.Date(format(parsed, "2000-%m-%d"))
  }
  as.numeric(to_reference_year(date1) - to_reference_year(date2))
}
# Quick check of compare_dates_days(): both dates are in February, 14 days
# apart, and the first one is earlier, so the expected result is -14.
a <- "2009-02-14"
b <- "2009-02-28"
diff <- compare_dates_days(a, b)
下面的函数返回向量中绝对值最小的非零元素的索引。
# Index of the element of `v` with the smallest absolute value, ignoring
# elements that are exactly zero. Returns integer(0) when no non-zero,
# non-NA element exists.
min_abs_index <- function(v) {
  # Blank out the zero entries so which.min() cannot select them.
  magnitudes <- replace(abs(v), which(v == 0), NA)
  which.min(magnitudes)
}
下面的函数把一个日期与一个日期向量进行比较,返回上一年和下一年中与该日期最接近的日期的索引;它会用到上面定义的函数。
# For one date, find the closest month/day match in the previous and in the
# following calendar year among a vector of reference dates.
#
# Args:
#   date: a single date string in "YYYY-MM-DD" form.
#   date_ref_compare: vector of date strings in the same form.
#   date_format: kept for signature compatibility; not used.
#
# Returns:
#   c(res_ahead, res_behind), where res_ahead is the index (into
#   date_ref_compare) of the nearest date in the year before `date`, and
#   res_behind the index of the nearest date in the year after it.
#   When one of the two years has no candidate, its index is integer(0), so
#   c() drops it and the result has fewer than two elements. That shape is
#   kept on purpose so existing callers see the same result layout as before.
above_below_year_date <- function(date, date_ref_compare, date_format = "%Y-%m-%d") {
  date <- as.character(date)
  date_ref_compare <- as.character(date_ref_compare)

  # The calendar year is the first "-"-separated field.
  year_of <- function(x) {
    as.numeric(vapply(strsplit(x, "-"), `[[`, character(1), 1))
  }
  # +1: the reference date lies in the previous year; -1: in the next year.
  year_gap <- year_of(date) - year_of(date_ref_compare)

  # Month/day distance of every reference date to `date`.
  date_diffs <- vapply(
    date_ref_compare,
    function(ref) compare_dates_days(ref, date),
    numeric(1),
    USE.NAMES = FALSE
  )

  # Restrict the search to the wanted year through the candidate index set,
  # rather than marking other rows with a 0 distance: a genuine 0-day
  # difference (same month and day) is the best possible match and must stay
  # eligible.
  nearest_in <- function(keep) {
    candidates <- which(keep)
    candidates[which.min(abs(date_diffs[candidates]))]
  }

  res_ahead <- nearest_in(year_gap == 1)
  res_behind <- nearest_in(year_gap == -1)
  c(res_ahead, res_behind)
}
我们将上述函数应用于提供的向量中的每个日期:
# Run the lookup for every observation and collect the matched row indices.
n_obs <- length(dates$date)
vector_of_ahead_indices <- rep(0, n_obs)
vector_of_behind_indices <- rep(0, n_obs)
# seq_len() gives an empty loop for empty data, where 1:length(x) would
# iterate over c(1, 0).
for (i in seq_len(n_obs)) {
  res <- above_below_year_date(dates$date[i], dates$date)
  # When both matches exist, res[1] is the previous-year match and res[2] the
  # next-year match; a missing second element reads as NA.
  vector_of_ahead_indices[i] <- res[1]
  vector_of_behind_indices[i] <- res[2]
}
# Indexing with NA yields NA, so observations without a match get NA values.
dates$nearest_val_nextyear <- dates$value[vector_of_behind_indices]
dates$nearest_val_prevyear <- dates$value[vector_of_ahead_indices]
然后我们按日期排序以便于手动检查,并修正第一年中NA值落在错误列里的问题。
# Sort chronologically to make manual checking easier.
dates <- dates[order(dates$date), ]

# Fix up the earliest year. Its observations have no previous year, so the
# lookup returns a single index that lands in the "previous year" slot; the
# value stored there is really the next-year match. Move it to the correct
# column for every observation of that year, not only for the first row.
obs_year <- substr(as.character(dates$date), 1, 4)
in_first_year <- !is.na(obs_year) & obs_year == obs_year[1]
if (any(in_first_year)) {
  dates[in_first_year, "nearest_val_nextyear"] <-
    dates[in_first_year, "nearest_val_prevyear"]
  dates[in_first_year, "nearest_val_prevyear"] <- NA
}
最后,这是排序的输出以及原始行名称:
date value nearest_val_nextyear nearest_val_prevyear
10 2007-06-30 5.593901 5.782136 NA
12 2008-05-30 5.782136 6.511781 5.593901
5 2008-08-12 6.124931 5.389843 5.593901
11 2008-08-24 5.918977 5.389843 5.593901
1 2009-02-14 6.511781 3.010648 5.782136
2 2009-12-23 5.389843 4.943871 5.918977
19 2010-01-15 4.521850 4.378759 6.511781
14 2010-01-27 3.010648 4.378759 6.511781
16 2010-12-25 4.943871 4.378759 5.389843
3 2011-08-01 4.378759 5.074565 4.943871
9 2012-01-13 5.821221 5.619826 4.378759
8 2012-04-14 5.943836 5.417942 4.378759
13 2012-06-30 5.074565 5.417942 4.378759
17 2012-09-27 4.844204 5.417942 4.378759
15 2013-02-27 5.619826 4.955066 5.821221
20 2013-03-21 5.417942 4.955066 5.943836
6 2014-03-10 4.955066 NA 5.417942
4 2014-04-07 2.785300 NA 5.417942
7 2014-07-23 4.983810 NA 5.417942
18 2014-12-08 3.529248 NA 5.417942