我有一个数据帧,其时间序列如下所示:
df<-structure(list(date = structure(c(-6905, -6891, -6853, -6588,
-6588, -6586, -6523, -6515, -5856, -5753), class = "Date"), flow = c(2.22,
2.56, 3.3, 1.38, 4, 1.4, 1.32, 1.26, 6, 35.69)), .Names = c("date",
"flow"), row.names = c(NA, 10L), class = "data.frame")
我想删除日期前后2天之内不是最大值的所有行。因此,在上述情况下,第4行和第6行将被删除。我找不到类似的回答问题。
我写的这段代码行不通,而且丑陋,冗长且无法处理数据帧的边缘:
idx <- c()
for (j in 3:(length(df$date)-2)){
if (as.Date(df$date[j+2])-as.Date(df$date[j])<3 |
as.Date(df$date[j])-as.Date(df$date[j-2])<3){
if (df$flow[j]!=max(df$flow[(j-2):(j+2)])){
idx <- c(idx,j)
}
} else if (as.Date(df$date[j+1])-as.Date(df$date[j])<3 |
as.Date(df$date[j])-as.Date(df$date[j-1])<3){
if (df$flow[j]!=max(df$flow[(j-1):(j+1)])){
idx <- c(idx,j)
}
}
}
请注意,数据框中的日期不是连续的。
答案 0 :(得分:3)
使用zoo
库。
library(zoo)
# convert into a zoo time series
dtf.zoo <- zoo(dt$flow, order.by=dt$date)
# remove duplicate dates by keeping the maximum value
dtf.zoo <- aggregate(dtf.zoo, time(dtf.zoo), max)
# pad with NAs to make the time series regular
dtf.zoo <- merge(
dtf.zoo,
zoo(, seq(min(index(dtf.zoo)), max(index(dtf.zoo)), "day"))
)
# find rows that are less than a value two days prior or hence
rem <- which(dtf.zoo < rollapply(dtf.zoo, 5, max, na.rm=TRUE, partial=TRUE))
# remove those rows
dtf.zoo2 <- dtf.zoo[-rem]
# remove NAs
dt2 <- data.frame(flow=na.omit(dtf.zoo2))
dt2
# flow
# 1951-02-05 2.22
# 1951-02-19 2.56
# 1951-03-29 3.30
# 1951-12-19 4.00
# 1952-02-22 1.32
# 1952-03-01 1.26
# 1953-12-20 6.00
# 1954-04-02 35.69
which(!(dt$flow %in% dt2$flow))
# 4 6
答案 1 :(得分:3)
我使用lapply()
检查范围:每个日期的 [日期-2天,日期+ 2天] 。
rm.list <- lapply(df$date, function(x) {
ind <- which(abs(df$date - x) <= 2)
flow <- df$flow[ind]
if(length(ind) > 1) which(flow < max(flow)) + min(ind) - 1
else NULL
})
rm <- unique(unlist(rm.list)) # [1] 4 6
df[-rm, ]
# date flow
# 1 1951-02-05 2.22
# 2 1951-02-19 2.56
# 3 1951-03-29 3.30
# 5 1951-12-19 4.00
# 7 1952-02-22 1.32
# 8 1952-03-01 1.26
# 9 1953-12-20 6.00
# 10 1954-04-02 35.69
答案 2 :(得分:2)
您还可以使用tidyverse
方法:
require(tidyverse)
df %>%
#Arrange by date
arrange(date) %>%
#Picking the max for each da
group_by(date) %>%
top_n(1, flow) %>%
ungroup() %>%
#Adding missing dates with NAs
complete(date = seq.Date(min(date), max(date), by="day")) %>%
#Remove Two up/down
mutate(
remove = case_when(
flow < rowMeans(data.frame(lag(flow, 1),
lag(flow, 2)), na.rm = TRUE) ~ "remove",
flow < rowMeans(data.frame(lead(flow, 1),
lead(flow, 2)), na.rm = TRUE) ~ "remove",
TRUE ~ "keep")) %>%
na.omit() %>%
filter(remove == "keep") %>%
select(-remove)
# A tibble: 8 x 2
date flow
<date> <dbl>
1 1951-02-05 2.22
2 1951-02-19 2.56
3 1951-03-29 3.30
4 1951-12-19 4.00
5 1952-02-22 1.32
6 1952-03-01 1.26
7 1953-12-20 6.00
8 1954-04-02 35.7