在R中滚动时间窗口上聚合数据

时间:2018-02-19 06:35:26

标签: r dplyr aggregate

我有一个数据帧,它基本上是我已经解析并制作成综合数据帧的日志数据

示例输入:

    Raw_log           Timestamp
1           2018-02-13 12:42:08.000 0.030 UDP 192.168.1.162:52085 -> 103.8.44.5:53 1 65 1 2018-02-13 12:42:08
2          2018-02-13 12:42:08.000 0.030 UDP 103.8.44.5:53 -> 192.168.1.162:52085 1 105 1 2018-02-13 12:42:08
3     2018-02-13 12:40:08.000 0.150 TCP 192.168.1.124:52812 -> 216.58.221.35:443 10 709 1 2018-02-13 12:40:08
4    2018-02-13 12:40:08.000 0.150 TCP 216.58.221.35:443 -> 192.168.1.124:52812 10 5130 1 2018-02-13 12:40:08
5       2018-02-13 12:40:32.000 0.126 TCP 192.168.1.173:57140 -> 167.89.123.54:80 5 988 1 2018-02-13 12:40:32
6       2018-02-13 12:40:32.000 0.126 TCP 167.89.123.54:80 -> 192.168.1.173:57140 4 507 1 2018-02-13 12:40:32
7   2018-02-13 12:42:08.000 0.030 UDP 192.168.1.124:41809 -> 24.156.242.137:57274 1 131 1 2018-02-13 12:42:08
8   2018-02-13 12:42:08.000 0.030 UDP 24.156.242.137:57274 -> 192.168.1.124:41809 1 317 1 2018-02-13 12:42:08
9    2018-02-13 12:40:37.000 0.121 TCP 192.168.1.170:50277 -> 216.58.196.202:443 9 1298 1 2018-02-13 12:40:37
10 2018-02-13 12:40:37.000 0.121 TCP 216.58.196.202:443 -> 192.168.1.170:50277 20 18557 1 2018-02-13 12:40:37
   Duration Protocol         src_ip src_port        dest_ip dest_port Packets Bytes Flows
1     0.030      UDP  192.168.1.162    52085     103.8.44.5        53       1    65     1
2     0.030      UDP     103.8.44.5       53  192.168.1.162     52085       1   105     1
3     0.150      TCP  192.168.1.124    52812  216.58.221.35       443      10   709     1
4     0.150      TCP  216.58.221.35      443  192.168.1.124     52812      10  5130     1
5     0.126      TCP  192.168.1.173    57140  167.89.123.54        80       5   988     1
6     0.126      TCP  167.89.123.54       80  192.168.1.173     57140       4   507     1
7     0.030      UDP  192.168.1.124    41809 24.156.242.137     57274       1   131     1
8     0.030      UDP 24.156.242.137    57274  192.168.1.124     41809       1   317     1
9     0.121      TCP  192.168.1.170    50277 216.58.196.202       443       9  1298     1
10    0.121      TCP 216.58.196.202      443  192.168.1.170     50277      20 18557     1

https://docs.google.com/spreadsheets/d/1mzH6KOQQ87-sKyMkVQ8KS7ELKE6K28cZMkQcMub5lRc/edit?usp=sharing

现在,对于每个5分钟滚动窗口的每个源IP,我需要计算唯一的目标IP。所以,我的第一个窗口是00:00:00到00:05:00,我的第二个窗口是00:01:00到00:06:00(聚合超过5分钟窗口,时间步长为1分钟)。谁能建议我一个方法来实现这个目标?

所需的输出是:

   src_ip         Timestamp           dest_ip_list                                  
 1 103.8.44.5     2018-02-13 12:42:00 192.168.1.162               
 2 167.89.123.54  2018-02-13 12:40:00 192.168.1.173               
 3 192.168.1.124  2018-02-13 12:40:00 216.58.221.35,24.156.242.137
 4 192.168.1.124  2018-02-13 12:42:00 24.156.242.137              
 5 192.168.1.162  2018-02-13 12:42:00 103.8.44.5                  
 6 192.168.1.170  2018-02-13 12:40:00 216.58.196.202              
 7 192.168.1.173  2018-02-13 12:40:00 167.89.123.54               
 8 216.58.196.202 2018-02-13 12:40:00 192.168.1.170               
 9 216.58.221.35  2018-02-13 12:40:00 192.168.1.124               
10 24.156.242.137 2018-02-13 12:42:00 192.168.1.124 

1 个答案:

答案 0 :(得分:2)

您可以尝试这样的事情:

library(dplyr)

# List the distinct destination IPs contacted by each source IP within fixed,
# non-overlapping 5-minute intervals.
#
# Expects `df` to have a character `Timestamp` column formatted as
# "YYYY-mm-dd HH:MM:SS" plus `src_ip` and `dest_ip` columns.
# Returns one row per (src_ip, interval) with a comma-separated
# `unique_dest_ip` string. The intervals do not overlap, so this is a
# tumbling window rather than a sliding one.
df %>%
  mutate(
    # Refer to the column directly: `df$Timestamp` would bypass the data mask
    # and return the wrong rows as soon as the input is grouped or filtered.
    Timestamp = as.POSIXct(
      Timestamp,
      format = "%Y-%m-%d %H:%M:%S",
      tz = "GMT"
    ),
    # cut() yields a factor whose labels are the start time of each interval.
    rolling_window = cut(Timestamp, "5 min")
  ) %>%
  group_by(src_ip, rolling_window) %>%
  summarise(unique_dest_ip = paste(unique(dest_ip), collapse = ",")) %>%
  # Drop the leftover `src_ip` grouping so later verbs see a plain table.
  ungroup()

输出是:

  src_ip         rolling_window      unique_dest_ip              
  <chr>          <fct>               <chr>                       
1 103.8.44.5     2018-02-13 12:40:00 192.168.1.162               
2 167.89.123.54  2018-02-13 12:40:00 192.168.1.173               
3 192.168.1.124  2018-02-13 12:40:00 216.58.221.35,24.156.242.137
4 192.168.1.162  2018-02-13 12:40:00 103.8.44.5                  
5 192.168.1.170  2018-02-13 12:40:00 216.58.196.202              
6 192.168.1.173  2018-02-13 12:40:00 167.89.123.54               
7 216.58.196.202 2018-02-13 12:40:00 192.168.1.170               
8 216.58.221.35  2018-02-13 12:40:00 192.168.1.124               
9 24.156.242.137 2018-02-13 12:40:00 192.168.1.124

示例数据:

# Reproducible sample data: ten parsed flow records (five request/response
# pairs). `Raw_log` is the unparsed log line; the remaining columns are the
# fields extracted from it. `Timestamp` is deliberately kept as character.
df <- data.frame(
  Raw_log = c(
    "2018-02-13 12:42:08.000 0.030 UDP 192.168.1.162:52085 -> 103.8.44.5:53 1 65 1",
    "2018-02-13 12:42:08.000 0.030 UDP 103.8.44.5:53 -> 192.168.1.162:52085 1 105 1",
    "2018-02-13 12:40:08.000 0.150 TCP 192.168.1.124:52812 -> 216.58.221.35:443 10 709 1",
    "2018-02-13 12:40:08.000 0.150 TCP 216.58.221.35:443 -> 192.168.1.124:52812 10 5130 1",
    "2018-02-13 12:40:32.000 0.126 TCP 192.168.1.173:57140 -> 167.89.123.54:80 5 988 1",
    "2018-02-13 12:40:32.000 0.126 TCP 167.89.123.54:80 -> 192.168.1.173:57140 4 507 1",
    "2018-02-13 12:42:08.000 0.030 UDP 192.168.1.124:41809 -> 24.156.242.137:57274 1 131 1",
    "2018-02-13 12:42:08.000 0.030 UDP 24.156.242.137:57274 -> 192.168.1.124:41809 1 317 1",
    "2018-02-13 12:40:37.000 0.121 TCP 192.168.1.170:50277 -> 216.58.196.202:443 9 1298 1",
    "2018-02-13 12:40:37.000 0.121 TCP 216.58.196.202:443 -> 192.168.1.170:50277 20 18557 1"
  ),
  # Both directions of a flow share the same timestamp and duration.
  Timestamp = rep(
    c(
      "2018-02-13 12:42:08",
      "2018-02-13 12:40:08",
      "2018-02-13 12:40:32",
      "2018-02-13 12:42:08",
      "2018-02-13 12:40:37"
    ),
    each = 2L
  ),
  Duration = rep(c(0.03, 0.15, 0.126, 0.03, 0.121), each = 2L),
  Protocol = rep(c("UDP", "TCP", "UDP", "TCP"), times = c(2L, 4L, 2L, 2L)),
  src_ip = c(
    "192.168.1.162", "103.8.44.5",
    "192.168.1.124", "216.58.221.35",
    "192.168.1.173", "167.89.123.54",
    "192.168.1.124", "24.156.242.137",
    "192.168.1.170", "216.58.196.202"
  ),
  src_port = c(
    52085L, 53L, 52812L, 443L, 57140L, 80L, 41809L, 57274L, 50277L, 443L
  ),
  dest_ip = c(
    "103.8.44.5", "192.168.1.162",
    "216.58.221.35", "192.168.1.124",
    "167.89.123.54", "192.168.1.173",
    "24.156.242.137", "192.168.1.124",
    "216.58.196.202", "192.168.1.170"
  ),
  dest_port = c(
    53L, 52085L, 443L, 52812L, 80L, 57140L, 57274L, 41809L, 443L, 50277L
  ),
  Packets = c(1L, 1L, 10L, 10L, 5L, 4L, 1L, 1L, 9L, 20L),
  Bytes = c(65L, 105L, 709L, 5130L, 988L, 507L, 131L, 317L, 1298L, 18557L),
  Flows = rep(1L, 10L),
  # Keep the text columns as character on R versions older than 4.0 as well.
  stringsAsFactors = FALSE
)



编辑:如果您想对每个Timestamp,把其后5分钟内出现的`dest_ip`做滚动拼接(如此答案的评论中所述),

更新后的代码如下:

library(dplyr)
library(zoo)

# For every source IP and every minute in which it sent traffic, list the
# distinct destination IPs it contacted in the 5-minute window starting at
# that minute (window width 5 minutes, step 1 minute).
#
# Expects `df` to have `Timestamp` ("YYYY-mm-dd HH:MM:SS"), `src_ip` and
# `dest_ip` columns. Produces a table with columns `src_ip`, `Timestamp`
# (window start) and `dest_ip_list` (comma-separated destinations).
window_minutes <- 5L

# Parse the timestamps and floor each one to the start of its minute.
df <- df %>%
  mutate(
    Timestamp = as.POSIXct(
      Timestamp,
      format = "%Y-%m-%d %H:%M:%S",
      tz = "GMT"
    ),
    Timestamp_HM = as.POSIXct(trunc(Timestamp, units = "mins"), tz = "GMT")
  )

# Collapse to ONE row per (source IP, minute). Without this step a source
# with several records in the same minute yields several rows after the join
# below, and a window of 5 rows would no longer span 5 minutes.
per_minute <- df %>%
  group_by(src_ip, Timestamp_HM) %>%
  summarise(dest_ips = list(unique(dest_ip))) %>%
  ungroup()

# Gap-free 1-minute grid, so that a row offset equals elapsed minutes.
timestamp_seq <- seq(
  min(df$Timestamp_HM),
  max(df$Timestamp_HM),
  by = "1 min"
)

expand.grid(
  Timestamp = timestamp_seq,
  src_ip = unique(df$src_ip),
  stringsAsFactors = FALSE
) %>%
  # Minutes without traffic get an empty (NULL) `dest_ips` entry.
  left_join(
    per_minute,
    by = c("src_ip" = "src_ip", "Timestamp" = "Timestamp_HM")
  ) %>%
  arrange(src_ip, Timestamp) %>%
  group_by(src_ip) %>%
  mutate(
    # Roll over row positions rather than values, so each window can pool the
    # list entries of the current and the following minutes. `partial = TRUE`
    # lets the windows at the end of the grid use the minutes that remain.
    dest_ip_list = rollapply(
      seq_along(dest_ips),
      width = window_minutes,
      FUN = function(i) paste(unique(unlist(dest_ips[i])), collapse = ","),
      align = "left",
      partial = TRUE
    )
  ) %>%
  ungroup() %>%
  # Report only the minutes in which the source actually sent traffic.
  filter(lengths(dest_ips) > 0) %>%
  select(src_ip, Timestamp, dest_ip_list)