# Example data (dput output): a 17-row tibble with the columns group,
# timestamp (POSIXct, tzone "UTC"), subgroup, A and B.
ex <- structure(list(group = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, 2, 2), timestamp = structure(c(1504975114, 1504975115,
1504975116, 1504975116, 1504975121, 1504975121, 1504975121, 1504975121,
1504963482, 1504963486, 1504963486, 1504964343, 1504964343, 1504964394,
1504964394, 1504964394, 1504964394), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), subgroup = c(36L, 36L, 36L, 35L, 36L, 35L,
35L, 36L, 43L, 43L, 14L, 14L, 14L, 14L, 14L, 43L, 43L), A = c(1L,
49L, 1L, 74L, 12L, 61L, 5L, 5L, 1L, 30L, 30L, 18L, 19L, 32L,
40L, 32L, 40L), B = c(1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("group", "timestamp",
"subgroup", "A", "B"), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -17L))
我有类似上面的数据。我想按时间戳对group
中的数据进行排序,同时还要妥善处理时间戳相同(并列)的情况。确切地说,如果多个观测值具有相同的时间戳,我希望排在最前面的是其subgroup
与前一个时间戳的subgroup值相同的观测值。因此所需的输出如下所示:
# A tibble: 17 x 5
group timestamp subgroup A B
<dbl> <dttm> <int> <int> <int>
1 1.00 2017-09-09 16:38:34 36 1 1
2 1.00 2017-09-09 16:38:35 36 49 1
3 1.00 2017-09-09 16:38:36 36 1 0
4 1.00 2017-09-09 16:38:36 35 74 1
5 1.00 2017-09-09 16:38:41 35 61 1
6 1.00 2017-09-09 16:38:41 35 5 0
7 1.00 2017-09-09 16:38:41 36 12 1
8 1.00 2017-09-09 16:38:41 36 5 1
9 2.00 2017-09-09 13:24:42 43 1 1
10 2.00 2017-09-09 13:24:46 43 30 1
11 2.00 2017-09-09 13:24:46 14 30 1
12 2.00 2017-09-09 13:39:03 14 18 1
13 2.00 2017-09-09 13:39:03 14 19 1
14 2.00 2017-09-09 13:39:54 14 32 1
15 2.00 2017-09-09 13:39:54 14 40 1
16 2.00 2017-09-09 13:39:54 43 32 1
17 2.00 2017-09-09 13:39:54 43 40 1
我该怎么做?
答案 0 :(得分:2)
这里是使用tidyverse
的想法:
library(tidyverse)
# Rank the rows of ONE group so that, among rows sharing a timestamp, the
# subgroup used last at the previous timestamp comes first.
#
# subgroup, timestamp: equally long vectors for a single group. They must
# already be sorted by timestamp, because split() returns its pieces in
# sorted timestamp order and the result is matched back to rows by position.
# Returns an integer vector with one rank per input row.
tie_rank <- function(subgroup, timestamp) {
  by_time <- split(subgroup, timestamp)
  # For each timestamp, the distinct subgroups in the order they should
  # appear: those carried over from the previous timestamp (most recent
  # first), followed by the ones that are new at this timestamp.
  wanted <- accumulate(by_time, ~ intersect(c(rev(.x), .y), .y))
  # Position of each row's subgroup within that wanted order.
  unlist(map2(by_time, wanted, match), use.names = FALSE)
}

ex %>%
  # Sort first so tie_rank() lines up with the rows even for unsorted input.
  arrange(group, timestamp) %>%
  group_by(group) %>%
  mutate(order = tie_rank(subgroup, timestamp)) %>%
  arrange(group, timestamp, order)
# # A tibble: 17 x 6
# # Groups: group [2]
# group timestamp subgroup A B order
# <dbl> <dttm> <int> <int> <int> <int>
# 1 1 2017-09-09 16:38:34 36 1 1 1
# 2 1 2017-09-09 16:38:35 36 49 1 1
# 3 1 2017-09-09 16:38:36 36 1 0 1
# 4 1 2017-09-09 16:38:36 35 74 1 2
# 5 1 2017-09-09 16:38:41 35 61 1 1
# 6 1 2017-09-09 16:38:41 35 5 0 1
# 7 1 2017-09-09 16:38:41 36 12 1 2
# 8 1 2017-09-09 16:38:41 36 5 1 2
# 9 2 2017-09-09 13:24:42 43 1 1 1
# 10 2 2017-09-09 13:24:46 43 30 1 1
# 11 2 2017-09-09 13:24:46 14 30 1 2
# 12 2 2017-09-09 13:39:03 14 18 1 1
# 13 2 2017-09-09 13:39:03 14 19 1 1
# 14 2 2017-09-09 13:39:54 14 32 1 1
# 15 2 2017-09-09 13:39:54 14 40 1 1
# 16 2 2017-09-09 13:39:54 43 32 1 2
# 17 2 2017-09-09 13:39:54 43 40 1 2
我假设时间戳是预先排序的,如果没有,则首先使用ex %>% arrange(group, timestamp) %>% ...
进行排序。
您可以添加%>% select(-order) %>% ungroup
来精确获得所需的输出(我将其保留为这种方式,以便于理解)。
让我们仅保留第1组来说明mutate调用中发生的情况:
# Keep only the rows of group 1 for the walkthrough.
ex1 <- ex %>%
  filter(group == 1)
对于每个时间戳,我们列出一个子组列表:
# One list element per timestamp, holding the subgroups seen at that time.
split_ <- with(ex1, split(subgroup, timestamp))
# $`2017-09-09 16:38:34`
# [1] 36
#
# $`2017-09-09 16:38:35`
# [1] 36
#
# $`2017-09-09 16:38:36`
# [1] 36 35
#
# $`2017-09-09 16:38:41`
# [1] 36 35 35 36
应该更改最后一项的顺序,35
应该在36
之前,因为它在第3个元素中最后使用。由于intersect
将项目的顺序保留在第一个参数中,因此我可以像这样获得最后一个项目的正确顺序:
# Put the subgroups of the 3rd timestamp (most recent first) in front of
# those of the 4th, then keep only the values that occur in the 4th.
previous_ <- rev(split_[[3]])
current_ <- split_[[4]]
intersect(c(previous_, current_), current_)
# [1] 35 36
为了把这种转换应用到所有元素上,我使用purrr::accumulate
,因为计算下一个元素的顺序时总是需要上一个已算出的顺序:
# Carry the ordering forward: each step reuses the order computed for the
# previous timestamp to order the subgroups of the current one.
acc_ <- accumulate(
  split_,
  function(prev, cur) intersect(c(rev(prev), cur), cur)
)
# [[1]]
# [1] 36
#
# [[2]]
# [1] 36
#
# [[3]]
# [1] 36 35
#
# [[4]]
# [1] 35 36
如果将split_
和acc_
与match
一起使用,就可以得到这些元素在输出中应有的顺序:
# For every timestamp, look up where each row's subgroup sits in the
# ordering computed for that timestamp.
map2(split_, acc_, function(observed, wanted) match(observed, wanted))
# $`2017-09-09 16:38:34`
# [1] 1
#
# $`2017-09-09 16:38:35`
# [1] 1
#
# $`2017-09-09 16:38:36`
# [1] 1 2
#
# $`2017-09-09 16:38:41`
# [1] 2 1 1 2
然后我可以用unlist
把结果展开成order
列,并按order
进行排序以获得所需的输出。
答案 1 :(得分:0)
这是通常可以使用的代码。
library(dplyr)
# Plain lexicographic sort: group, then timestamp, then subgroup.
arrange(ex, group, timestamp, subgroup)
但这会产生此输出
group timestamp subgroup A B
<dbl> <dttm> <int> <int> <int>
1 1. 2017-09-09 16:38:34 36 1 1
2 1. 2017-09-09 16:38:35 36 49 1
3 1. 2017-09-09 16:38:36 36 1 0
4 1. 2017-09-09 16:38:36 35 74 1
5 1. 2017-09-09 16:38:41 36 12 1
6 1. 2017-09-09 16:38:41 35 61 1
7 1. 2017-09-09 16:38:41 35 5 0
8 1. 2017-09-09 16:38:41 36 5 1
9 2. 2017-09-09 13:24:42 43 1 1
10 2. 2017-09-09 13:24:46 43 30 1
原因是日期时间对象也存储秒的分数,因此,尽管第5行和第6行的时间戳似乎相同,但事实并非如此。您可以快速as.numeric(ex$timestamp)
来进行检查。
现在,我在下面的代码中舍入了微秒,以产生您似乎想要的输出。
library(dplyr)
library(lubridate) # to use round_date
# Same sort, but with the timestamp rounded to the ".5s" unit first.
arrange(ex, group, round_date(timestamp, ".5s"), subgroup)
group timestamp subgroup A B
<dbl> <dttm> <int> <int> <int>
1 1. 2017-09-09 16:38:34 36 1 1
2 1. 2017-09-09 16:38:35 36 49 1
3 1. 2017-09-09 16:38:36 35 74 1
4 1. 2017-09-09 16:38:36 36 1 0
5 1. 2017-09-09 16:38:41 35 61 1
6 1. 2017-09-09 16:38:41 35 5 0
7 1. 2017-09-09 16:38:41 36 12 1
8 1. 2017-09-09 16:38:41 36 5 1
9 2. 2017-09-09 13:24:42 43 1 1
10 2. 2017-09-09 13:24:46 14 30 1
11 2. 2017-09-09 13:24:46 43 30 1
12 2. 2017-09-09 13:39:03 14 18 1
13 2. 2017-09-09 13:39:03 14 19 1
14 2. 2017-09-09 13:39:54 14 32 1
15 2. 2017-09-09 13:39:54 14 40 1
16 2. 2017-09-09 13:39:54 43 32 1
17 2. 2017-09-09 13:39:54 43 40 1
答案 2 :(得分:0)
这将产生所需的结果。我想知道是否有更简单的方法
library(dplyr)
# Normalise the timestamps, sort, and group the rows that share the same
# (group, timestamp). `order` and `subgroup_lag` are placeholders that the
# loop below fills in.
ex1 <- ex %>%
  mutate(timestamp = as.POSIXct(as.character(timestamp))) %>%
  arrange(group, timestamp) %>%
  group_by(group, timestamp) %>%
  mutate(order = 0, subgroup_lag = 0)

# Bucket id per (group, timestamp). It is taken from ex1 itself (already
# sorted and grouped) so the ids line up with ex1's rows even when `ex`
# was not sorted to begin with.
ex1$group_id <- group_indices(ex1)

# One data frame per bucket, in sorted (group, timestamp) order.
ex_list <- split(ex1, ex1$group_id)

# seq_along(...)[-1] is empty when there is a single bucket, whereas
# 2:length(...) would count backwards.
for (i in seq_along(ex_list)[-1]) {
  current <- ex_list[[i]]
  previous <- ex_list[[i - 1]]
  # Only tied rows need reordering, and only relative to the previous
  # timestamp of the SAME group.
  if (nrow(current) > 1 && identical(previous$group[1], current$group[1])) {
    # Subgroup of the last row of the (already reordered) previous bucket.
    prev_subgroup <- previous$subgroup[nrow(previous)]
    current$subgroup_lag <- prev_subgroup
    # Test the scalar: a length > 1 condition is an error in recent R.
    if (prev_subgroup > 0) {
      # Mark rows that continue the previous subgroup so they sort first.
      current$order <- ifelse(current$subgroup == prev_subgroup, 1, 0)
    }
  }
  ex_list[[i]] <- current %>%
    arrange(desc(order))
}

df <- do.call(rbind, ex_list) # bind the buckets back into one data frame
df
输出如下
group timestamp subgroup A B
<dbl> <dttm> <int> <int> <int>
1 1. 2017-09-09 16:38:34 36 1 1
2 1. 2017-09-09 16:38:35 36 49 1
3 1. 2017-09-09 16:38:36 36 1 0
4 1. 2017-09-09 16:38:36 35 74 1
5 1. 2017-09-09 16:38:41 35 61 1
6 1. 2017-09-09 16:38:41 35 5 0
7 1. 2017-09-09 16:38:41 36 12 1
8 1. 2017-09-09 16:38:41 36 5 1
9 2. 2017-09-09 13:24:42 43 1 1
10 2. 2017-09-09 13:24:46 43 30 1
11 2. 2017-09-09 13:24:46 14 30 1
12 2. 2017-09-09 13:39:03 14 18 1
13 2. 2017-09-09 13:39:03 14 19 1
14 2. 2017-09-09 13:39:54 14 32 1
15 2. 2017-09-09 13:39:54 14 40 1
16 2. 2017-09-09 13:39:54 43 32 1
17 2. 2017-09-09 13:39:54 43 40 1
答案 3 :(得分:0)
矢量化解决方案。但恐怕它并不比for循环更有效
# Reorder rows that share a (group, timestamp) so that rows whose subgroup
# equals the last subgroup of the previous timestamp of the same group come
# first.
#
# @param data Data frame with the columns group, timestamp and subgroup.
#   Defaults to the global `ex`, which is what the zero-argument call uses.
# @return Invisibly (via print), the reordered data with the helper columns
#   id, group_id, subgroup_1, temp, subgroup_lag and order. The result is
#   also printed.
vector_f <- function(data = ex) {
  data$id <- seq_along(data$group)
  ex1 <- data %>%
    mutate(timestamp = as.POSIXct(as.character(timestamp)))
  # Bucket id per (group, timestamp).
  ex1$group_id <- as.numeric(group_indices(group_by(ex1, group, timestamp)))

  n_buckets <- max(ex1$group_id)
  df_list <- vector("list", n_buckets)

  # The first bucket has no predecessor, so its helper columns are zero.
  df_list[[1]] <- ex1 %>%
    filter(group_id == 1) %>%
    mutate(subgroup_1 = 0, order = 0, temp = 0, subgroup_lag = 0) %>%
    ungroup()

  # seq_len(...)[-1] is empty when there is a single bucket, whereas
  # 2:max(...) would count backwards.
  for (i in seq_len(n_buckets)[-1]) {
    df_list[[i]] <- ex1 %>%
      # Only the previous bucket is needed to order the current one.
      filter(group_id %in% c(i - 1, i)) %>%
      arrange(group, timestamp) %>%
      group_by(group, timestamp) %>%
      mutate(subgroup_1 = last(subgroup)) %>%
      # Lag within `group` so a group's first timestamp does not inherit
      # the subgroup of the preceding group.
      group_by(group) %>%
      mutate(temp = lag(subgroup_1, n = 1)) %>%
      group_by(group, timestamp) %>%
      # first() takes no `n` argument.
      mutate(subgroup_lag = first(temp)) %>%
      # A missing lag (first timestamp of a group) counts as "no match".
      mutate(
        order = ifelse(!is.na(subgroup_lag) & subgroup_lag == subgroup, 1, 0)
      ) %>%
      arrange(group, timestamp, desc(order)) %>%
      ungroup() %>%
      filter(group_id %in% c(i))
  }

  df <- do.call(rbind, df_list)
  print(df)
}
当我使用您提供的数据集进行比较时,for循环显示出更好的结果。其原因是,即使进行矢量化处理,我们也不得不将数据集拆分成多个组来进行比较。坦白地说,这是一种更复杂的解决方案,矢量化的所有好处都消失了
> microbenchmark(vector_f(), for_f(), times=100)
Unit: milliseconds
expr min lq mean median uq max neval
vector_f() 58.03299 66.40527 73.79760 70.92226 78.58620 115.33876 100
for_f() 12.64291 13.80850 16.32043 16.10607 17.63527 27.66872 100
我还运行了代码,创建了一个850万行的虚拟数据集,这是系统时间。我建议删除不需要的列,然后在完成此处理后将其重新添加
system.time(vector_f())
user system elapsed
244.47 3.40 248.12
system.time(for_f())
user system elapsed
218.61 0.71 219.50
使用基于data.table的解决方案可能会更快