我有一个数据框(data frame),其中是我已经解析并整理成一张综合表的日志数据。
示例输入:
Raw_log Timestamp
1 2018-02-13 12:42:08.000 0.030 UDP 192.168.1.162:52085 -> 103.8.44.5:53 1 65 1 2018-02-13 12:42:08
2 2018-02-13 12:42:08.000 0.030 UDP 103.8.44.5:53 -> 192.168.1.162:52085 1 105 1 2018-02-13 12:42:08
3 2018-02-13 12:40:08.000 0.150 TCP 192.168.1.124:52812 -> 216.58.221.35:443 10 709 1 2018-02-13 12:40:08
4 2018-02-13 12:40:08.000 0.150 TCP 216.58.221.35:443 -> 192.168.1.124:52812 10 5130 1 2018-02-13 12:40:08
5 2018-02-13 12:40:32.000 0.126 TCP 192.168.1.173:57140 -> 167.89.123.54:80 5 988 1 2018-02-13 12:40:32
6 2018-02-13 12:40:32.000 0.126 TCP 167.89.123.54:80 -> 192.168.1.173:57140 4 507 1 2018-02-13 12:40:32
7 2018-02-13 12:42:08.000 0.030 UDP 192.168.1.124:41809 -> 24.156.242.137:57274 1 131 1 2018-02-13 12:42:08
8 2018-02-13 12:42:08.000 0.030 UDP 24.156.242.137:57274 -> 192.168.1.124:41809 1 317 1 2018-02-13 12:42:08
9 2018-02-13 12:40:37.000 0.121 TCP 192.168.1.170:50277 -> 216.58.196.202:443 9 1298 1 2018-02-13 12:40:37
10 2018-02-13 12:40:37.000 0.121 TCP 216.58.196.202:443 -> 192.168.1.170:50277 20 18557 1 2018-02-13 12:40:37
Duration Protocol src_ip src_port dest_ip dest_port Packets Bytes Flows
1 0.030 UDP 192.168.1.162 52085 103.8.44.5 53 1 65 1
2 0.030 UDP 103.8.44.5 53 192.168.1.162 52085 1 105 1
3 0.150 TCP 192.168.1.124 52812 216.58.221.35 443 10 709 1
4 0.150 TCP 216.58.221.35 443 192.168.1.124 52812 10 5130 1
5 0.126 TCP 192.168.1.173 57140 167.89.123.54 80 5 988 1
6 0.126 TCP 167.89.123.54 80 192.168.1.173 57140 4 507 1
7 0.030 UDP 192.168.1.124 41809 24.156.242.137 57274 1 131 1
8 0.030 UDP 24.156.242.137 57274 192.168.1.124 41809 1 317 1
9 0.121 TCP 192.168.1.170 50277 216.58.196.202 443 9 1298 1
10 0.121 TCP 216.58.196.202 443 192.168.1.170 50277 20 18557 1
https://docs.google.com/spreadsheets/d/1mzH6KOQQ87-sKyMkVQ8KS7ELKE6K28cZMkQcMub5lRc/edit?usp=sharing
现在,我需要针对每个源IP,在每个5分钟的滚动窗口内统计不重复的目标IP。例如,第一个窗口是00:00:00到00:05:00,第二个窗口是00:01:00到00:06:00(即窗口长度为5分钟,步长为1分钟)。谁能建议一种实现方法?
所需的输出是:
src_ip Timestamp dest_ip_list
1 103.8.44.5 2018-02-13 12:42:00 192.168.1.162
2 167.89.123.54 2018-02-13 12:40:00 192.168.1.173
3 192.168.1.124 2018-02-13 12:40:00 216.58.221.35,24.156.242.137
4 192.168.1.124 2018-02-13 12:42:00 24.156.242.137
5 192.168.1.162 2018-02-13 12:42:00 103.8.44.5
6 192.168.1.170 2018-02-13 12:40:00 216.58.196.202
7 192.168.1.173 2018-02-13 12:40:00 167.89.123.54
8 216.58.196.202 2018-02-13 12:40:00 192.168.1.170
9 216.58.221.35 2018-02-13 12:40:00 192.168.1.124
10 24.156.242.137 2018-02-13 12:42:00 192.168.1.124
答案 0 :(得分:2)
您可以尝试这样的事情:
library(dplyr)
# Bucket the flows into fixed, non-overlapping 5-minute bins and list the
# distinct destination IPs that each source IP contacted within every bin.
#
# Input:  `df` with a character `Timestamp` column ("%Y-%m-%d %H:%M:%S") and
#         `src_ip` / `dest_ip` columns.
# Output: printed tibble with columns `src_ip`, `rolling_window` (factor
#         produced by cut()) and `unique_dest_ip` (comma-separated string).
# NOTE: despite the column name these are tumbling bins, not sliding windows;
# `df` itself is not modified.
df %>%
  # Use the bare column name: `df$Timestamp` would bypass the piped data and
  # silently misalign if the input were filtered, grouped or reordered upstream.
  mutate(
    Timestamp = as.POSIXct(Timestamp, format = "%Y-%m-%d %H:%M:%S", tz = "GMT")
  ) %>%
  mutate(rolling_window = cut(Timestamp, "5 min")) %>%
  group_by(src_ip, rolling_window) %>%
  summarise(unique_dest_ip = paste(unique(dest_ip), collapse = ",")) %>%
  # summarise() only peels off the last grouping level; drop the rest so later
  # verbs do not unexpectedly operate per src_ip.
  ungroup()
输出是:
src_ip rolling_window unique_dest_ip
<chr> <fct> <chr>
1 103.8.44.5 2018-02-13 12:40:00 192.168.1.162
2 167.89.123.54 2018-02-13 12:40:00 192.168.1.173
3 192.168.1.124 2018-02-13 12:40:00 216.58.221.35,24.156.242.137
4 192.168.1.162 2018-02-13 12:40:00 103.8.44.5
5 192.168.1.170 2018-02-13 12:40:00 216.58.196.202
6 192.168.1.173 2018-02-13 12:40:00 167.89.123.54
7 216.58.196.202 2018-02-13 12:40:00 192.168.1.170
8 216.58.221.35 2018-02-13 12:40:00 192.168.1.124
9 24.156.242.137 2018-02-13 12:40:00 192.168.1.124
示例数据:
# Sample data: ten parsed flow records (five request/response pairs).
# Raw_log holds the original log line; the remaining columns are the fields
# parsed out of it. Timestamp is kept as character here and converted later.
df <- data.frame(
  Raw_log = c(
    "2018-02-13 12:42:08.000 0.030 UDP 192.168.1.162:52085 -> 103.8.44.5:53 1 65 1",
    "2018-02-13 12:42:08.000 0.030 UDP 103.8.44.5:53 -> 192.168.1.162:52085 1 105 1",
    "2018-02-13 12:40:08.000 0.150 TCP 192.168.1.124:52812 -> 216.58.221.35:443 10 709 1",
    "2018-02-13 12:40:08.000 0.150 TCP 216.58.221.35:443 -> 192.168.1.124:52812 10 5130 1",
    "2018-02-13 12:40:32.000 0.126 TCP 192.168.1.173:57140 -> 167.89.123.54:80 5 988 1",
    "2018-02-13 12:40:32.000 0.126 TCP 167.89.123.54:80 -> 192.168.1.173:57140 4 507 1",
    "2018-02-13 12:42:08.000 0.030 UDP 192.168.1.124:41809 -> 24.156.242.137:57274 1 131 1",
    "2018-02-13 12:42:08.000 0.030 UDP 24.156.242.137:57274 -> 192.168.1.124:41809 1 317 1",
    "2018-02-13 12:40:37.000 0.121 TCP 192.168.1.170:50277 -> 216.58.196.202:443 9 1298 1",
    "2018-02-13 12:40:37.000 0.121 TCP 216.58.196.202:443 -> 192.168.1.170:50277 20 18557 1"
  ),
  # Each request is immediately followed by its response, so the per-flow
  # timestamp, duration and protocol values come in consecutive pairs.
  Timestamp = rep(
    c(
      "2018-02-13 12:42:08",
      "2018-02-13 12:40:08",
      "2018-02-13 12:40:32",
      "2018-02-13 12:42:08",
      "2018-02-13 12:40:37"
    ),
    each = 2L
  ),
  Duration = rep(c(0.03, 0.15, 0.126, 0.03, 0.121), each = 2L),
  Protocol = rep(c("UDP", "TCP", "TCP", "UDP", "TCP"), each = 2L),
  src_ip = c(
    "192.168.1.162", "103.8.44.5",
    "192.168.1.124", "216.58.221.35",
    "192.168.1.173", "167.89.123.54",
    "192.168.1.124", "24.156.242.137",
    "192.168.1.170", "216.58.196.202"
  ),
  src_port = c(
    52085L, 53L,
    52812L, 443L,
    57140L, 80L,
    41809L, 57274L,
    50277L, 443L
  ),
  dest_ip = c(
    "103.8.44.5", "192.168.1.162",
    "216.58.221.35", "192.168.1.124",
    "167.89.123.54", "192.168.1.173",
    "24.156.242.137", "192.168.1.124",
    "216.58.196.202", "192.168.1.170"
  ),
  dest_port = c(
    53L, 52085L,
    443L, 52812L,
    80L, 57140L,
    57274L, 41809L,
    443L, 50277L
  ),
  Packets = c(1L, 1L, 10L, 10L, 5L, 4L, 1L, 1L, 9L, 20L),
  Bytes = c(65L, 105L, 709L, 5130L, 988L, 507L, 131L, 317L, 1298L, 18557L),
  Flows = rep(1L, 10L),
  stringsAsFactors = FALSE
)
(编辑:如果您希望像此答案的评论中描述的那样,对每个 Timestamp 把其后5分钟内出现的 dest_ip 滚动拼接起来,可以使用下面的更新代码)
更新后的代码是:
library(dplyr)
library(zoo)
# Sliding 5-minute aggregation (1-minute step) of the unique destination IPs
# contacted by each source IP.
#
# Input:  `df` with a `Timestamp` column ("%Y-%m-%d %H:%M:%S") plus `src_ip`
#         and `dest_ip` columns.
# Effect: `df` is overwritten with `Timestamp` parsed to POSIXct (GMT) and a
#         new `Timestamp_HM` column holding the timestamp floored to the minute.
# Output: printed table with one row per (src_ip, minute) that has traffic and
#         columns `src_ip`, `Timestamp`, `dest_ip_list`, where `dest_ip_list`
#         is the comma-separated set of destination IPs seen in the window
#         [Timestamp, Timestamp + 5 min).

# Window length, expressed in 1-minute grid rows.
window_minutes <- 5L

df <- df %>%
  mutate(
    # Bare column name instead of `df$Timestamp`, so the piped data is used.
    Timestamp = as.POSIXct(Timestamp, format = "%Y-%m-%d %H:%M:%S", tz = "GMT"),
    # Floor to the minute directly; avoids the factor round-trip through cut().
    Timestamp_HM = as.POSIXct(trunc(Timestamp, units = "mins"), tz = "GMT")
  )

# seq() on POSIXct already returns POSIXct, so no re-parsing is needed.
timestamp_seq <- seq(min(df$Timestamp_HM), max(df$Timestamp_HM), by = "1 min")

# Collapse to exactly one row per (src_ip, minute). Without this, several
# flows in the same minute would add extra rows to the grid below and the
# fixed-width row window would span less than the intended 5 minutes.
per_minute <- df %>%
  filter(!is.na(dest_ip)) %>%
  group_by(src_ip, Timestamp_HM) %>%
  summarise(dest_ip = paste(unique(dest_ip), collapse = ",")) %>%
  ungroup()

# Merge the per-minute IP strings that fall inside one window into a single
# de-duplicated, comma-separated string (minutes without traffic are NA).
collapse_window <- function(z) {
  ips <- unlist(strsplit(as.character(z[!is.na(z)]), ",", fixed = TRUE))
  paste(unique(ips), collapse = ",")
}

# Full minute-by-source grid (Cartesian product), so that every row of a
# source's series represents one minute, including minutes without traffic.
merge(
  data.frame(Timestamp = timestamp_seq),
  data.frame(src_ip = unique(df$src_ip), stringsAsFactors = FALSE),
  by = NULL
) %>%
  left_join(
    per_minute,
    by = c("src_ip" = "src_ip", "Timestamp" = "Timestamp_HM")
  ) %>%
  arrange(src_ip, Timestamp) %>%
  group_by(src_ip) %>%
  mutate(
    # Left-aligned window looks forward in time; partial = TRUE lets the last
    # rows of each series use the shorter window that remains.
    dest_ip_list = rollapply(
      dest_ip,
      window_minutes,
      FUN = collapse_window,
      fill = NA,
      align = "left",
      partial = TRUE
    )
  ) %>%
  ungroup() %>%
  # Keep only the minutes in which the source actually sent traffic.
  filter(!is.na(dest_ip)) %>%
  select(src_ip, Timestamp, dest_ip_list)