# Toy example for the question: A is the time stamp, B and D are the columns
# whose values have to match, and C is the flag that must differ within a match.
A <- 100 + 0:5
B <- c(99, 99, 100, 99.5, 98, 101)
C <- c("D", "S", "D", "D", "S", "D")
D <- rep(110, times = 6) # the literal 0110 is just the number 110
data <- data.frame(A = A, B = B, C = C, D = D)
大家好,我有一个庞大的数据集,正试图从中筛选出满足特定条件的行。上面给出了一个简化的示例,希望有助于理解我的问题。基本上,我要找出所有在B列和D列中取值相同的行。对于这些行,A列中的取值(即时间)应当相等,或者相差不超过一秒。除此之外,这些行在C列中的取值必须不同。因此,我需要的代码应返回一个只包含所有这些行的数据框。对于我的示例,输出数据应如下所示。
[1] 100 99 D 110
[2] 101 99 S 110
structure(list(trd_exctn_dt = c(20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L), trd_exctn_tm = c(62076, 62099, 62110, 62120, 62126, 62173, 62215, 62231, 62266, 62280), trd_rpt_dt = c(20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L), trd_rpt_tm = c(62077, 62150, 62111, 62121, 62127, 62174, 62218, 62232, 62326, 62283), msg_seq_nb = c(77464L, 77513L, 77483L, 77493L, 77499L, 77526L, 77550L, 77556L, 77609L, 77577L), entrd_vol_qt = c(1e+06, 2e+06, 250000, 1e+06, 1e+06, 1e+06, 1e+06, 2e+06, 2e+06, 1e+06), rptd_pr = c(100.337, 99.922, 100.337, 99.922, 100.391, 100.059, 100.391, 100.426, 100.434, 100.391), rpt_side_cd = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("B", "S"), class = "factor")), .Names = c("trd_exctn_dt", "trd_exctn_tm", "trd_rpt_dt", "trd_rpt_tm", "msg_seq_nb", "entrd_vol_qt", "rptd_pr", "rpt_side_cd"), class = "data.frame", row.names = c(12791L, 474L, 33360L, 467L, 12795L, 45846L, 12804L, 12806L, 33373L, 12810L))
答案 0 :(得分:0)
我已经使用了您上面提供的数据,它似乎对我有用。
library(dplyr)
# Ten sample trade reports taken from the question. Built with data.frame()
# rather than the raw dput() structure; column order, column types, the B/S
# factor and the original integer row names are the same.
dt <- data.frame(
  trd_exctn_dt = rep(20130430L, 10),
  trd_exctn_tm = c(
    62076, 62099, 62110, 62120, 62126,
    62173, 62215, 62231, 62266, 62280
  ),
  trd_rpt_dt = rep(20130430L, 10),
  trd_rpt_tm = c(
    62077, 62150, 62111, 62121, 62127,
    62174, 62218, 62232, 62326, 62283
  ),
  msg_seq_nb = c(
    77464L, 77513L, 77483L, 77493L, 77499L,
    77526L, 77550L, 77556L, 77609L, 77577L
  ),
  entrd_vol_qt = c(
    1e+06, 2e+06, 250000, 1e+06, 1e+06,
    1e+06, 1e+06, 2e+06, 2e+06, 1e+06
  ),
  rptd_pr = c(
    100.337, 99.922, 100.337, 99.922, 100.391,
    100.059, 100.391, 100.426, 100.434, 100.391
  ),
  # first six reports are side "B", last four are side "S"
  rpt_side_cd = factor(rep(c("B", "S"), times = c(6, 4)), levels = c("B", "S")),
  row.names = c(
    12791L, 474L, 33360L, 467L, 12795L,
    45846L, 12804L, 12806L, 33373L, 12810L
  )
)
# print the table for inspection
dt
# trd_exctn_dt trd_exctn_tm trd_rpt_dt trd_rpt_tm msg_seq_nb entrd_vol_qt rptd_pr rpt_side_cd
# 12791 20130430 62076 20130430 62077 77464 1000000 100.337 B
# 474 20130430 62099 20130430 62150 77513 2000000 99.922 B
# 33360 20130430 62110 20130430 62111 77483 250000 100.337 B
# 467 20130430 62120 20130430 62121 77493 1000000 99.922 B
# 12795 20130430 62126 20130430 62127 77499 1000000 100.391 B
# 45846 20130430 62173 20130430 62174 77526 1000000 100.059 B
# 12804 20130430 62215 20130430 62218 77550 1000000 100.391 S
# 12806 20130430 62231 20130430 62232 77556 2000000 100.426 S
# 33373 20130430 62266 20130430 62326 77609 2000000 100.434 S
# 12810 20130430 62280 20130430 62283 77577 1000000 100.391 S
# Find trades that share the same price and execution date, were reported on
# more than one side (rpt_side_cd), and lie close together in execution time.
# For every such group the earliest trade of each side is returned.
dt %>%
  arrange(trd_exctn_tm) %>% # sort by time so distinct() keeps the earliest row
  group_by(rptd_pr, trd_exctn_dt) %>% # one group per price / date combination
  mutate(
    Count1 = n(), # number of rows in the group
    Count2 = n_distinct(rpt_side_cd) # number of different sides in the group
  ) %>%
  filter(Count1 > 1 & Count2 > 1) %>% # need several rows and more than one side
  mutate(Dist = trd_exctn_tm - min(trd_exctn_tm)) %>% # offset from the group's earliest trade
  filter(Dist <= 160) %>% # keep rows within 160 time units (presumably seconds) of it
  # The distance filter may have removed every row of one side. Re-check that
  # more than one side is still present, otherwise a lone unmatched row would
  # end up in the result.
  filter(n_distinct(rpt_side_cd) > 1) %>%
  distinct(rpt_side_cd, .keep_all = TRUE) %>% # first row per side; TRUE rather than the reassignable T
  ungroup() %>% # drop the grouping
  select(-Count1, -Count2, -Dist) # remove the helper columns
# # A tibble: 2 x 8
# trd_exctn_dt trd_exctn_tm trd_rpt_dt trd_rpt_tm msg_seq_nb entrd_vol_qt rptd_pr rpt_side_cd
# <int> <dbl> <int> <dbl> <int> <dbl> <dbl> <fctr>
# 1 20130430 62126 20130430 62127 77499 1e+06 100.391 B
# 2 20130430 62215 20130430 62218 77550 1e+06 100.391 S
如果这是您想要的输出,您必须检查您使用的是最新版本的R(3.3.1)和dplyr
(0.5.0),因为这正是我正在使用的。
如果您希望将tibble输出保存为新数据框,则只需执行
dt2 = dt %>% ...