我有一个如下数据集:
date, time,product,shop_id
20140104 900 Banana 18
20140104 900 Banana 19
20140104 924 Banana 18
20140104 929 Banana 18
20140104 932 Banana 20
20140104 948 Banana 18
我需要得到不同的 product 和不同的 shop_id 的组合,
所以,我需要按 product + shop_id 进行分组,每种组合只保留一行。
这是我的代码:
# Load plyr for its split-apply helpers.
library(plyr)
# d_ply() splits `shop` by product and shop_id and calls table() on each piece,
# but the d_* family discards the per-group results and returns NULL.
d_ply( shop, .( product,shop_id ),table )
# NOTE(review): `p` is never assigned in the code shown; presumably it was meant
# to hold the d_ply() result, which would explain the NULL output - confirm.
print(p)
不幸的是,它打印出来的是 NULL。
数据集:
# Sample data: five rows sharing one date, with a time, product and shop id.
date <- c(20140104, 20140104, 20140104, 20140104, 20140104)
time <- c(924, 900, 854, 700, 1450)
# Product names must be quoted strings; a bare Banana is looked up as a
# variable and stops with "object 'Banana' not found".
product <- c("Banana", "Banana", "Banana", "Banana", "Banana")
shop_id <- c(18, 18, 18, 19, 20)
shop <- data.frame(
  date = date,
  time = time,
  product = product,
  shop_id = shop_id
)
输出应为
date, time, product, shop_id
20140104 900 Banana 19
20140104 932 Banana 20
20140104 948 Banana 18
答案 0 :(得分:0)
我们可以做到
library(tidyverse)

# Reduce `shop` to one row per product/shop_id combination.
shop %>%
  # n = number of rows recorded for each product/shop_id combination
  group_by(product, shop_id) %>%
  mutate(n = n()) %>%
  # when several rows share the same time, keep the one whose
  # product/shop_id combination has the fewest rows
  group_by(time) %>%
  arrange(n) %>%
  slice(1) %>%
  # within each product/shop_id combination keep the latest remaining time
  group_by(product, shop_id) %>%
  arrange(desc(time)) %>%
  slice(1) %>%
  # drop the grouping so later verbs do not silently operate per group
  ungroup() %>%
  select(-n) %>%
  arrange(time)
# date time product shop_id
# <int> <int> <chr> <int>
#1 20140104 900 Banana 19
#2 20140104 932 Banana 20
#3 20140104 948 Banana 18
答案 1 :(得分:0)
如果只想保留每个唯一组合的第一行,只需使用 stats 包
中的 aggregate:
> aggregate(shop, by=list(shop$product, shop$shop_id), FUN=function(x){x[1]})
Group.1 Group.2 date time product shop_id
1 Banana 18 20140104 924 Banana 18
2 Banana 19 20140104 700 Banana 19
3 Banana 20 20140104 1450 Banana 20
说明:FUN=function(x){x[1]}
表示当多行落入同一组合(即发生“碰撞”)时,每一列只取第一个元素。
删除“Group.1”,“Group.2”或其他列:
> res <- aggregate(shop, by=list(shop$product, shop$shop_id), FUN=function(x){x[1]})
> res[ , !(names(res) %in% c("Group.1", "Group.2"))]
date time product shop_id
1 20140104 924 Banana 18
2 20140104 700 Banana 19
3 20140104 1450 Banana 20
P.S. 您提供的数据集与您期望输出的示例不一致,这就是结果中数字存在差异的原因。
P.S.2如果您想在发生碰撞时获取所有数据:
> aggregate(shop, by=list(shop$product, shop$shop_id), FUN="identity")
Group.1 Group.2 date time product shop_id
1 Banana 18 20140104, 20140104, 20140104 924, 900, 854 1, 1, 1 18, 18, 18
2 Banana 19 20140104 700 1 19
3 Banana 20 20140104 1450 1 20
如果您想标记碰撞:
> aggregate(shop, by=list(shop$product, shop$shop_id), FUN=function(x){if (length(x) > 1) NA else x})
Group.1 Group.2 date time product shop_id
1 Banana 18 NA NA NA NA
2 Banana 19 20140104 700 1 19
3 Banana 20 20140104 1450 1 20
如果要排除非唯一行:
> res <- aggregate(shop, by=list(shop$product, shop$shop_id), FUN=function(x){if (length(x) > 1) NULL else x})
> res[res$product != "NULL", !(names(res) %in% c("Group.1", "Group.2"))]
date time product shop_id
2 20140104 700 1 19
3 20140104 1450 1 20
如果想避免 product 列从字符串(String)被强制转换为整数(Int),请让 FUN 返回字符串 ""/"NULL"/"NA",而不是 NULL / NA。
答案 2 :(得分:0)
可以使用dplyr完成,如下所示:
# Build the example data: five Banana rows that all share one date.
date <- rep(20140104, 5)
time <- c(924, 900, 854, 700, 1450)
product <- rep("Banana", 5)
shop_id <- c(18, 18, 18, 19, 20)
# Assemble the columns into the `shop` data frame used below.
shop <- data.frame(
  date = date,
  time = time,
  product = product,
  shop_id = shop_id
)
# load the dplyr library
library(dplyr)

# For every product/shop_id combination, return the earliest date and the
# earliest time recorded on that date.
shop %>%
  group_by(product, shop_id) %>%
  # Keep only the rows from the earliest date of each combination. Filtering
  # first matters: inside a single summarise(), `date = min(date)` replaces
  # `date` immediately, so a later `time[date == min(date)]` would select
  # every time in the group instead of the ones on the earliest date.
  filter(date == min(date)) %>%
  summarise(date = min(date), time = min(time)) %>%
  # drop the remaining grouping so the result is a plain tibble
  ungroup()