我有:
Keyword Date Pos Bid
a 4/11/14 1 5.00
a 4/13/14 1 5.00
a 4/14/14 1 5.00
b 6/2/14 3 9.00
b 7/2/14 4 9.00
b 8/2/14 4 9.00
c 8/29/14 2 3.00
c 8/30/14 2 3.00
c 8/31/14 2 3.00
我需要取子集,只保留每个 Keyword 中日期最新的那一行:
Keyword Date Pos Bid
a 4/14/14 1 5.00
b 8/2/14 4 9.00
c 8/31/14 2 3.00
我试过了:
# Attempt 1 from the question: build one summary vector per Keyword with ddply().
# NOTE(review): below, the raw `x$Date` is compared with a value parsed by
# as.Date(), while `x$Date` itself is never converted -- probably why the
# expected row is not matched; confirm against the actual column type.
Latest = ddply( df,
'Keyword',
function(x) c (
Date = max(as.Date(x$Date, '%m/%d/%y')),
Pos = x$Pos[which(x$Date == max(as.Date(x$Date, '%m/%d/%y')))],
Bid = x$Bid[which(x$Date == max(as.Date(x$Date, '%m/%d/%y')))]
)
)
和
# Attempt 2 from the question: subset() with a column selection.
# NOTE(review): `Date = max(...)` uses `=`, so it is passed as an argument
# named `Date` rather than as a logical row condition (`==`).
Latest = subset( x,
Date = max(as.Date(Date, '%m/%d/%y')),
select = c('Identity', 'Date', 'Round.Avg.Pos.', 'Search.Bid')
)
但这些要么报错,要么得不到我想要的结果。我漏掉了什么?
感谢。
答案 0 :(得分:6)
你可以尝试
library(dplyr)
library(tidyr)
# Parse Date, sort newest-first, then keep the first row of each Keyword.
df %>%
  mutate(Date = as.Date(Date, format = "%m/%d/%y")) %>%
  arrange(desc(Date)) %>%
  group_by(Keyword) %>%
  slice(1)
# Keyword Date Pos Bid
#1 a 2014-04-14 1 5
#2 b 2014-08-02 4 9
#3 c 2014-08-31 2 3
或者
# Convert Date, then keep every row whose Date equals its Keyword's maximum
# (rows tied for the latest date are all kept).
df %>%
  mutate(Date = as.Date(Date, format = "%m/%d/%y")) %>%
  group_by(Keyword) %>%
  filter(Date == max(Date))
或使用base R
# Parse the m/d/yy strings once and reuse the result.
dates <- as.Date(df$Date, format = "%m/%d/%y")
# For every row, the latest date found within that row's Keyword.
indx <- ave(dates, df$Keyword, FUN = max)
# Keep the rows whose own date is their Keyword's latest; the Date column
# itself is left in its original text form.
df[dates == indx, ]
# Keyword Date Pos Bid
#3 a 4/14/14 1 5
#6 b 8/2/14 4 9
#9 c 8/31/14 2 3
或使用ddply
# For each Keyword, return the row(s) carrying that group's latest date.
ddply(df, "Keyword", function(grp) {
  parsed <- as.Date(grp$Date, format = "%m/%d/%y")
  grp[parsed == max(parsed), ]
})
# Keyword Date Pos Bid
#1 a 4/14/14 1 5
#2 b 8/2/14 4 9
#3 c 8/31/14 2 3
# Sample data used by the answers: three keywords, three dated rows each.
# Date is kept as m/d/yy text; Pos is integer and Bid is double.
df <- data.frame(
  Keyword = rep(c("a", "b", "c"), each = 3L),
  Date = c(
    "4/11/14", "4/13/14", "4/14/14",
    "6/2/14", "7/2/14", "8/2/14",
    "8/29/14", "8/30/14", "8/31/14"
  ),
  Pos = c(1L, 1L, 1L, 3L, 4L, 4L, 2L, 2L, 2L),
  Bid = c(5, 5, 5, 9, 9, 9, 3, 3, 3),
  stringsAsFactors = FALSE
)
答案 1 :(得分:3)
或使用data.table
library(data.table)
# setDT() turns df into a data.table by reference (df itself is modified).
setDT(df)
# Within each Keyword, keep the single row holding the latest parsed date.
df[, .SD[which.max(as.Date(Date, format = "%m/%d/%y"))], by = Keyword]
# Keyword Date Pos Bid
# 1: a 4/14/14 1 5
# 2: b 8/2/14 4 9
# 3: c 8/31/14 2 3
下面是另一个使用“split-apply-combine”方法的 base R 解决方案:
# Split: one data frame per Keyword.
groups <- split(df, df$Keyword)
# Apply: from each piece keep the row with the latest parsed date.
latest <- lapply(groups, function(g) {
  g[which.max(as.Date(g$Date, format = "%m/%d/%y")), ]
})
# Combine: stack the per-Keyword rows back into one data frame.
do.call(rbind, latest)
# Keyword Date Pos Bid
# a a 4/14/14 1 5
# b b 8/2/14 4 9
# c c 8/31/14 2 3
注意:您期望的输出中 Date 列保持原有格式,因此我在这两个解决方案的每次迭代中都调用了 as.Date;而最佳实践是先将该列一次性转换为 Date 类,然后在聚合过程中直接使用已转换的列。
答案 2 :(得分:1)
尝试:
# Keep, for each Keyword, the row with the most recent Date.
# Parse the m/d/yy strings once, passing the pattern through the named
# `format` argument of as.Date().
ddf$Date <- as.Date(ddf$Date, format = "%m/%d/%y")
# Sort newest first. order(decreasing = TRUE) leaves NA dates (unparseable
# input) at the end, so they are never preferred over a real date; rows tied
# on the same date keep their original relative order.
ddf <- ddf[order(ddf$Date, decreasing = TRUE), ]
# duplicated() flags every occurrence after the first, so only the newest row
# of each Keyword survives.
ddf <- ddf[!duplicated(ddf$Keyword), ]
# Show the result sorted by Keyword.
ddf[order(ddf$Keyword), ]
Keyword Date Pos Bid
3 a 2014-04-14 1 5
6 b 2014-08-02 4 9
9 c 2014-08-31 2 3