我的数据在 Excel 中看起来像下面这样,并且还会扩展到更多列(Date5、Date6……):
Date1 Value1 Date2 Value2 Date3 Value3 Date4 Value4
1/2/2004 17 1/3/2004 27 1/1/2004 17 1/3/2004 31
1/3/2004 26 1/4/2004 30 1/3/2004 29 1/4/2004 36
1/4/2004 22 1/5/2004 22 1/4/2004 28 1/5/2004 33
1/5/2004 17 1/6/2004 28 1/5/2004 36 1/6/2004 50
1/13/2004 15 1/7/2004 17 1/12/2004 15 1/8/2004 9
1/14/2004 10 1/14/2004 21 1/14/2004 12 1/14/2004 11
我想排除这样的值:其关联日期没有同时出现在所有系列中。
我发布的样本数据结果如下:
Date Value1 Value2 Value3 Value4
1/3/2004 26 27 29 31
1/4/2004 22 30 28 36
1/5/2004 17 22 36 33
1/14/2004 10 21 12 11
答案 0 :(得分:3)
# Dates that occur in every series; intersect() keeps them in the order of
# their first appearance in Date1.
Date <- Reduce(intersect, list(df$Date1, df$Date2, df$Date3, df$Date4))
# Look each shared date up in its own series, so every Value column lines up
# with `Date` even when the series are not sorted identically. match() uses
# the first occurrence if a date is repeated within a series.
Value1 <- df$Value1[match(Date, df$Date1)]
Value2 <- df$Value2[match(Date, df$Date2)]
Value3 <- df$Value3[match(Date, df$Date3)]
Value4 <- df$Value4[match(Date, df$Date4)]
data.frame(Date, Value1, Value2, Value3, Value4)
# Date Value1 Value2 Value3 Value4
# 1 1/3/2004 26 27 29 31
# 2 1/4/2004 22 30 28 36
# 3 1/5/2004 17 22 36 33
# 4 1/14/2004 10 21 12 11
正如 @docendo 提到的那样,在列很多的情况下,上面的写法会变得很冗长;更新后的做法是:
# Dates that occur in every series (the Date columns are still listed by hand).
Date <- Reduce(intersect, list(df$Date1, df$Date2, df$Date3, df$Date4))
# Even-numbered columns hold the values, odd-numbered columns the dates.
# drop = FALSE keeps a data frame even when there is only one column pair.
Values <- df[, seq(2, ncol(df), by = 2), drop = FALSE]
Dates <- df[, seq(1, ncol(df), by = 2), drop = FALSE]
# Logical mask with the same shape as `Values`: TRUE where the date in that
# cell's series is one of the shared dates.
mat <- apply(Dates, 2, function(x) {x %in% Date})
# The selected values come out column by column, so the matrix needs one row
# per shared date. This assumes every series lists the shared dates in the
# same order as `Date`.
data.frame(Date, matrix(Values[mat], nrow = length(Date)))
# Date X1 X2 X3 X4
# 1 1/3/2004 26 27 29 31
# 2 1/4/2004 22 30 28 36
# 3 1/5/2004 17 22 36 33
# 4 1/14/2004 10 21 12 11
根据 @David 的评论,还可以像下面这样进一步改进:
Values <- df[c(FALSE, TRUE)]
# c(TRUE, FALSE) is recycled across the columns, picking the odd-numbered
# (date) columns.
Dates <- df[c(TRUE, FALSE)]
# Dates that occur in every date column.
Date <- Reduce(intersect, as.list(Dates))
# Logical mask with the same shape as `Values`: TRUE where the date in that
# cell's series is one of the shared dates.
mat <- apply(Dates, 2, function(x) {x %in% Date})
# The matrix needs one row per shared date. The number of series
# (ncol(df) / 2) only happened to equal that count for the sample data.
data.frame(Date, matrix(Values[mat], nrow = length(Date)))
# Date X1 X2 X3 X4
# 1 1/3/2004 26 27 29 31
# 2 1/4/2004 22 30 28 36
# 3 1/5/2004 17 22 36 33
# 4 1/14/2004 10 21 12 11
答案 1 :(得分:3)
这是一种 dplyr / tidyr 的方法:
library(dplyr); library(tidyr)
# Stack the Date* columns, then the Value* columns, so that every row pairs
# one date (key1 = its source column) with one value (key2 = its source
# column).
DF %>%
  gather(key1, Date, -starts_with("Value")) %>%
  gather(key2, Val, starts_with("Value")) %>%
  filter(
    # keep dates that occur in every Date* column ...
    Date %in% Reduce(intersect, select(DF, starts_with("Date"))),
    # ... and only rows whose date and value share the same series number
    gsub("[^0-9]", "", key1) == gsub("[^0-9]", "", key2)
  ) %>%
  select(-key1) %>%
  # back to one row per date with one column per value series
  spread(key2, Val)
# Date Value1 Value2 Value3 Value4
#1 1/14/2004 10 21 12 11
#2 1/3/2004 26 27 29 31
#3 1/4/2004 22 30 28 36
#4 1/5/2004 17 22 36 33
#Warning:
#attributes are not identical across measure variables; they will be dropped
该警告与被转换为 factor 的 Date 列有关。
-
在 @AntoniosK 的评论之后进行了编辑
答案 2 :(得分:1)
我的尝试没有额外的包:
# Sample data: alternating DateN / ValueN columns, one pair per series.
d <- read.table(header = TRUE, text =
'Date1 Value1 Date2 Value2 Date3 Value3 Date4 Value4
1/2/2004 17 1/3/2004 27 1/1/2004 17 1/3/2004 31
1/3/2004 26 1/4/2004 30 1/3/2004 29 1/4/2004 36
1/4/2004 22 1/5/2004 22 1/4/2004 28 1/5/2004 33
1/5/2004 17 1/6/2004 28 1/5/2004 36 1/6/2004 50
1/13/2004 15 1/7/2004 17 1/12/2004 15 1/8/2004 9
1/14/2004 10 1/14/2004 21 1/14/2004 12 1/14/2004 11')
# Number of (Date, Value) column pairs, i.e. the number of series.
l <- length(d) %/% 2
# Prefix for the series label stored in column I ("Date1", "Date2", ...).
D <- "Date"

# Extract series `i` in long format.
#   i     index of the series (1 = first Date/Value column pair)
#   data  wide data frame of alternating date/value columns (default: d)
#   label prefix for the series label (default: D)
# Returns a data frame with columns Date, Value and I (the series label).
dneu.i <- function(i, data = d, label = D) {
  di <- data[, c(2 * i - 1, 2 * i)]
  names(di) <- c("Date", "Value")
  di$I <- paste0(label, i)
  di
}

# Stack all series in one step. seq_len() also works for a single series,
# where `2:l` would count backwards (2, 1) and request columns that do not
# exist.
dneu <- do.call(rbind, lapply(seq_len(l), dneu.i))
# One row per date and one Value.<label> column per series; a date missing
# from a series yields NA in that series' column.
dneu.w <- reshape(dneu, direction = "wide", idvar = "Date", timevar = "I")
# Keep only the dates that have a value in every series. drop = FALSE keeps
# the value columns a data frame even if only one of them remains.
subset(dneu.w, complete.cases(dneu.w[, -1, drop = FALSE]))
答案 3 :(得分:1)
# Sample data: alternating DateN / ValueN columns, one pair per series.
# Uses `<-` for assignment and TRUE instead of the reassignable shorthand T.
dt <- read.table(text = "Date1 Value1 Date2 Value2 Date3 Value3 Date4 Value4
1/2/2004 17 1/3/2004 27 1/1/2004 17 1/3/2004 31
1/3/2004 26 1/4/2004 30 1/3/2004 29 1/4/2004 36
1/4/2004 22 1/5/2004 22 1/4/2004 28 1/5/2004 33
1/5/2004 17 1/6/2004 28 1/5/2004 36 1/6/2004 50
1/13/2004 15 1/7/2004 17 1/12/2004 15 1/8/2004 9
1/14/2004 10 1/14/2004 21 1/14/2004 12 1/14/2004 11", header = TRUE)
library(dplyr)
library(tidyr)
dt %>%
  # stack the date columns: one row per (initial date column, date)
  select(starts_with("Date")) %>%
  gather(DateGroup, Date, starts_with("Date")) %>%
  # stack the value columns the same way and attach them side by side, so
  # each date sits next to the value from the same row and series
  cbind(
    gather(
      select(dt, starts_with("Value")),
      ValueGroup, Value, starts_with("Value")
    )
  ) %>%
  # for each date, count in how many initial date columns it appears
  group_by(Date) %>%
  mutate(Group_count = n_distinct(DateGroup)) %>%
  ungroup() %>%
  # keep the dates that exist in all initial date columns
  filter(Group_count == n_distinct(DateGroup)) %>%
  # back to one row per date with one column per value series
  select(Date, ValueGroup, Value) %>%
  spread(ValueGroup, Value)
# Date Value1 Value2 Value3 Value4
# (chr) (int) (int) (int) (int)
# 1 1/14/2004 10 21 12 11
# 2 1/3/2004 26 27 29 31
# 3 1/4/2004 22 30 28 36
# 4 1/5/2004 17 22 36 33
答案 4 :(得分:0)
我添加答案有点迟了。无论如何...
显然,你正在处理一个多变量的时间序列。因此,你应该使用时间序列对象(如 zoo)来存储数据。最近我在 SO 上看到了很多问题,其中人们使用数据框和矩阵来存储多变量时间序列。我强烈建议你不要这样做。
以下是我使用 zoo 的解决方案:
library(zoo)
# Build one zoo series per (date, value) column pair of `d`, then merge them
# all; all = FALSE keeps only the dates present in every series.
do.call(
  "merge",
  c(
    lapply(
      # column positions grouped pairwise: `1` = c(1, 2), `2` = c(3, 4), ...
      split(seq_len(ncol(d)), rep(seq_len(ncol(d) %/% 2), each = 2)),
      function(pair) {
        # first column of the pair is the date, second the value
        stamps <- as.Date(d[, pair[1]], format = "%m/%d/%Y")
        zoo(d[, pair[2], drop = FALSE], stamps)
      }
    ),
    all = FALSE
  )
)
结果是
# Value1 Value2 Value3 Value4
#2004-01-03 26 27 29 31
#2004-01-04 22 30 28 36
#2004-01-05 17 22 36 33
#2004-01-14 10 21 12 11