按时间窗口以及列值相等/不同的条件对数据取子集

时间:2016-08-08 10:07:21

标签: r

# Toy version of the real data set described in the question:
#   A - time stamp (rows are compared by the difference of their A values)
#   B - value that must be equal between matching rows
#   C - flag that must differ between matching rows
#   D - value that must be equal between matching rows
A <- c(100, 101, 102, 103, 104, 105)
B <- c(99, 99, 100, 99.5, 98, 101)
C <- c("D", "S", "D", "D", "S", "D")
# The original literals were written as 0110. R has no octal literals and a
# numeric cannot carry a leading zero, so the stored value is simply 110
# (this is also what the expected output shows). Use a character vector
# such as "0110" instead if the leading zero is meaningful.
D <- rep(110, 6)
data <- data.frame(A, B, C, D)

大家好,我有一个非常庞大的数据集,正试图从中取出一个子集。上面给出了一个简化的示例,希望有助于理解我的问题。基本上,我要找出所有在B列和D列中取值都相等的行。对于这些行,A列中的值之差(即时间差)应为零或不超过一秒。此外,这些行在C列中的取值必须不同。因此,我需要的代码应返回一个只包含所有这些行的数据框。对于我的示例,输出数据应如下所示。

[1] 100 99 D 110
[2] 101 99 S 110

structure(list(trd_exctn_dt = c(20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L), trd_exctn_tm = c(62076, 62099, 62110, 62120, 62126, 62173, 62215, 62231, 62266, 62280), trd_rpt_dt = c(20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L, 20130430L), trd_rpt_tm = c(62077, 62150, 62111, 62121, 62127, 62174, 62218, 62232, 62326, 62283), msg_seq_nb = c(77464L, 77513L, 77483L, 77493L, 77499L, 77526L, 77550L, 77556L, 77609L, 77577L), entrd_vol_qt = c(1e+06, 2e+06, 250000, 1e+06, 1e+06, 1e+06, 1e+06, 2e+06, 2e+06, 1e+06), rptd_pr = c(100.337, 99.922, 100.337, 99.922, 100.391, 100.059, 100.391, 100.426, 100.434, 100.391), rpt_side_cd = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("B", "S"), class = "factor")), .Names = c("trd_exctn_dt", "trd_exctn_tm", "trd_rpt_dt", "trd_rpt_tm", "msg_seq_nb", "entrd_vol_qt", "rptd_pr", "rpt_side_cd"), class = "data.frame", row.names = c(12791L, 474L, 33360L, 467L, 12795L, 45846L, 12804L, 12806L, 33373L, 12810L))

1 个答案:

答案 0 :(得分:0)

我已经使用了您上面提供的数据,它似乎对我有用。

library(dplyr)

# Sample trades taken from the question, rebuilt column by column.
# Both date columns hold the same value on all ten rows, and the side code
# is six "B" rows followed by four "S" rows. The integer row names are the
# row positions the sample had in the asker's full data set.
dt <- data.frame(
  trd_exctn_dt = rep(20130430L, 10L),
  trd_exctn_tm = c(62076, 62099, 62110, 62120, 62126,
                   62173, 62215, 62231, 62266, 62280),
  trd_rpt_dt = rep(20130430L, 10L),
  trd_rpt_tm = c(62077, 62150, 62111, 62121, 62127,
                 62174, 62218, 62232, 62326, 62283),
  msg_seq_nb = c(77464L, 77513L, 77483L, 77493L, 77499L,
                 77526L, 77550L, 77556L, 77609L, 77577L),
  entrd_vol_qt = c(1e6, 2e6, 250000, 1e6, 1e6,
                   1e6, 1e6, 2e6, 2e6, 1e6),
  rptd_pr = c(100.337, 99.922, 100.337, 99.922, 100.391,
              100.059, 100.391, 100.426, 100.434, 100.391),
  rpt_side_cd = factor(rep(c("B", "S"), times = c(6L, 4L)),
                       levels = c("B", "S")),
  row.names = c(12791L, 474L, 33360L, 467L, 12795L,
                45846L, 12804L, 12806L, 33373L, 12810L)
)

# Print the data frame to check it against the listing below.
dt

#       trd_exctn_dt trd_exctn_tm trd_rpt_dt trd_rpt_tm msg_seq_nb entrd_vol_qt rptd_pr rpt_side_cd
# 12791     20130430        62076   20130430      62077      77464      1000000 100.337           B
# 474       20130430        62099   20130430      62150      77513      2000000  99.922           B
# 33360     20130430        62110   20130430      62111      77483       250000 100.337           B
# 467       20130430        62120   20130430      62121      77493      1000000  99.922           B
# 12795     20130430        62126   20130430      62127      77499      1000000 100.391           B
# 45846     20130430        62173   20130430      62174      77526      1000000 100.059           B
# 12804     20130430        62215   20130430      62218      77550      1000000 100.391           S
# 12806     20130430        62231   20130430      62232      77556      2000000 100.426           S
# 33373     20130430        62266   20130430      62326      77609      2000000 100.434           S
# 12810     20130430        62280   20130430      62283      77577      1000000 100.391           S



# Keep the earliest trade on each side (rpt_side_cd) among trades that share
# the same price and execution date and lie inside a time window.
#
# NOTE(review): the question asks for a time difference of at most one
# second, while this pipeline uses a window of 160 (in the units of
# trd_exctn_tm); adjust the threshold to the window you actually need.
# NOTE(review): Dist is measured from the earliest row of each group, not
# between every pair of rows, and only the first row per side is kept.
dt %>%
  # sort by execution time so that "first row per side" means "earliest"
  arrange(trd_exctn_tm) %>%
  # rows can only match when price and execution date are both equal
  group_by(rptd_pr, trd_exctn_dt) %>%
  mutate(
    Count1 = n(),                       # rows in this price/date group
    Count2 = n_distinct(rpt_side_cd)    # distinct sides in this group
  ) %>%
  # keep groups that have several rows and more than one side
  filter(Count1 > 1 & Count2 > 1) %>%
  # time elapsed since the earliest trade of the group
  mutate(Dist = trd_exctn_tm - min(trd_exctn_tm)) %>%
  # keep only rows inside the time window
  filter(Dist <= 160) %>%
  # first row per side within each group; TRUE is spelled out because the
  # alias T is an ordinary variable that can be reassigned
  distinct(rpt_side_cd, .keep_all = TRUE) %>%
  # drop the grouping before returning the result
  ungroup() %>%
  # remove the helper columns
  select(-Count1, -Count2, -Dist)

# # A tibble: 2 x 8
#    trd_exctn_dt trd_exctn_tm trd_rpt_dt trd_rpt_tm msg_seq_nb entrd_vol_qt rptd_pr rpt_side_cd
#           <int>        <dbl>      <int>      <dbl>      <int>        <dbl>   <dbl>      <fctr>
# 1     20130430        62126   20130430      62127      77499        1e+06 100.391           B
# 2     20130430        62215   20130430      62218      77550        1e+06 100.391           S

如果这就是您想要的输出,请确认您使用的是最新版本的 R(3.3.1)和 dplyr(0.5.0),因为我使用的正是这两个版本(distinct() 的 .keep_all 参数是在 dplyr 0.5.0 中引入的)。

如果您希望将tibble输出保存为新数据框,则只需执行

dt2 = dt %>% ...