我有一个很大的 CSV 文件,其中的数据类似下面这样:
192.168.107.87,4662,69.192.30.179,80,"other"
192.168.107.87,4662,69.192.30.179,80,"infection"
192.168.177.85,4662,69.192.30.179,80,"infection"
192.168.177.85,4662,69.192.30.179,80,"other"
我已经能够去掉完全相同的重复行,但还需要去掉那些所属连接同时也被标记为“infection”的“other”行,不确定该怎么做。下面是我的代码:它会删除重复行和重复的连接(源与目的互换的连接视为同一个),并丢弃消息不属于我关心的三种类型的行。我现在弄不清楚的是,如何删除那些同一连接还带有“infection”或“cnc”的“other”行。基本上,对于上面的示例,应当删除其中两条“other”记录;我需要保留该连接是“infection”还是“cnc”的信息,并且每种只保留一条。
# Copy alerts.csv to alertsfix.csv, keeping only the first occurrence of each
# line. Lines are compared as raw text (including the trailing newline), so
# only byte-identical rows count as duplicates.
with open(r'alerts.csv', 'r') as in_file, open('alertsfix.csv', 'w') as out_file:
    seen = set()  # lines already written; a set gives O(1) amortized lookup
    for line in in_file:
        if line in seen:
            continue  # skip exact duplicate
        seen.add(line)
        out_file.write(line)
# No explicit close() calls: the with statement closes both files on exit.
# Write a new file that eliminates cross connections (the same two endpoints
# with source and destination swapped) and keeps only the message types of
# interest.

# The message field is compared with its surrounding double quotes, because
# the rows are split on ',' below and the quotes are not stripped.
s1 = '"other"'
s2 = '"infection"'
s3 = '"cnc"'

with open('alertsfix.csv', 'r') as in_file, open('alertsfixmore.csv', 'w') as out_file:
    seen = set()  # (endpoint pair, message) keys already handled
    for line in in_file:
        lines = line.strip()
        if not lines:
            continue  # ignore blank lines
        # NOTE(review): plain split assumes exactly five fields and no commas
        # inside a field; any other row raises ValueError here.
        src_ip, src_port, dst_ip, dst_port, msg = lines.split(',')
        src = '{}:{}'.format(src_ip, src_port)
        dst = '{}:{}'.format(dst_ip, dst_port)
        # The inner frozenset ignores endpoint order, so A->B and B->A with
        # the same message produce the same key.
        key = frozenset([
            frozenset([src, dst]),
            msg,
        ])
        if key in seen:
            continue  # this connection/message pair was already seen
        seen.add(key)
        if msg in (s1, s2, s3):  # eliminate any other message types
            out_file.write(line)
# No explicit close() calls: the with statement closes both files on exit.
答案 0 :(得分:0)
按第 0 列对行进行排序;然后按第 0 列分组;在每个组中过滤掉所有 "other" 行;统计剩余行中 "infection" 和 "cnc" 的数量;最后将剩余的行添加到新的容器中。
import io, csv, itertools
# Sample alert rows standing in for the real CSV file. Each row has five
# comma-separated fields; the last one is the quoted message label.
f = io.StringIO('''192.168.107.87,4662,69.192.30.179,80,"other"
192.168.107.87,4662,69.192.30.179,80,"infection"
192.168.177.85,4662,69.192.30.179,80,"infection"
192.168.177.85,4662,69.192.30.179,80,"other"
192.168.177.111,4662,69.192.30.179,80,"cnc"
192.168.177.111,4662,69.192.30.179,80,"other"
192.168.177.222,4662,69.192.30.179,80,"infection"
192.168.177.222,4662,69.192.30.179,80,"cnc"
192.168.177.222,4662,69.192.30.179,80,"other"''')

# csv.reader strips the surrounding double quotes, so labels compare as
# plain "other" / "infection" / "cnc" below.
reader = csv.reader(f)
data = list(reader)

# groupby only merges adjacent rows, so sort by the same key first.
# The key is the first column, compared as a string.
data.sort(key=lambda item: item[0])
groups = itertools.groupby(data, lambda item: item[0])

newdata = []  # every row that is not labelled "other", in sorted order
infection, cnc = 0, 0  # running totals of the kept rows, by label
for key, group in groups:
    # Drop every "other" row of this group; keep the rest.
    group = [row for row in group if row[-1] != "other"]
    infection += sum(row[-1] == "infection" for row in group)
    cnc += sum(row[-1] == "cnc" for row in group)
    newdata.extend(group)
In [18]: cnc
Out[18]: 2
In [19]: infection
Out[19]: 3
In [20]: newdata
Out[20]:
[['192.168.107.87', '4662', '69.192.30.179', '80', 'infection'],
['192.168.177.111', '4662', '69.192.30.179', '80', 'cnc'],
['192.168.177.222', '4662', '69.192.30.179', '80', 'infection'],
['192.168.177.222', '4662', '69.192.30.179', '80', 'cnc'],
['192.168.177.85', '4662', '69.192.30.179', '80', 'infection']]
根据您实际要执行的操作,您可能需要按多个列进行排序和分组——对于示例数据,使用 lambda item: item[:-1] 作为键函数似乎同样可行。
答案 1 :(得分:0)
您还可以保留所有发生的事件及其计数的信息:
import pandas as pd
from io import StringIO
In []: f = StringIO('''192.168.107.87,4662,69.192.30.179,80,"other"
...: 192.168.107.87,4662,69.192.30.179,80,"infection"
...: 192.168.177.85,4662,69.192.30.179,80,"infection"
...: 192.168.177.85,4662,69.192.30.179,80,"other"
...: 192.168.177.111,4662,69.192.30.179,80,"cnc"
...: 192.168.177.111,4662,69.192.30.179,80,"other"
...: 192.168.177.222,4662,69.192.30.179,80,"infection"
...: 192.168.177.222,4662,69.192.30.179,80,"cnc"
...: 192.168.177.222,4662,69.192.30.179,80,"other"''')
In []: df = pd.read_csv(f, names=['IP_1', 'port_1', 'IP_2', 'port_2', 'event'])
In []: df
Out[]:
IP_1 port_1 IP_2 port_2 event
0 192.168.107.87 4662 69.192.30.179 80 other
1 192.168.107.87 4662 69.192.30.179 80 infection
2 192.168.177.85 4662 69.192.30.179 80 infection
3 192.168.177.85 4662 69.192.30.179 80 other
4 192.168.177.111 4662 69.192.30.179 80 cnc
5 192.168.177.111 4662 69.192.30.179 80 other
6 192.168.177.222 4662 69.192.30.179 80 infection
7 192.168.177.222 4662 69.192.30.179 80 cnc
8 192.168.177.222 4662 69.192.30.179 80 other
In []: pd.get_dummies(df, columns=['event']).groupby(list(df.columns[:-1]), as_index=False).sum()
Out[]:
IP_1 port_1 IP_2 port_2 event_cnc event_infection event_other
0 192.168.107.87 4662 69.192.30.179 80 0 1 1
1 192.168.177.111 4662 69.192.30.179 80 1 0 1
2 192.168.177.222 4662 69.192.30.179 80 1 1 1
3 192.168.177.85 4662 69.192.30.179 80 0 1 1