如何使用 pyparsing 模块解析包含多种日志格式的日志文件?以下是我目前使用的代码。
# -*- coding: utf-8 -*-
"""
"""
import pandas as pd
from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex
from time import strftime
class Parser(object):
    """Split a syslog-style line into timestamp, hostname, appname, pid and message.

    The grammar is assembled once in ``__init__`` and reused by ``parse``.
    """

    def __init__(self):
        """Assemble the pyparsing grammar used by ``parse``."""
        digits = Word(nums)

        # A "<n>" priority prefix is not part of this grammar.

        # Timestamp: three-letter month (capital first letter), day, HH:MM:SS.
        # It contributes three separate tokens to the result.
        month_name = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day_of_month = digits
        clock = Combine(digits + ":" + digits + ":" + digits)
        stamp = month_name + day_of_month + clock

        # Host name: letters, digits, underscore, dash and dot.
        host = Word(alphas + nums + "_-.")

        # Application tag, in one of two shapes:
        #   name[pid]  -> tokens: name, pid (the brackets are suppressed)
        #   name:      -> tokens: name, ":" (the colon is kept as a token)
        bracketed_pid = Suppress("[") + digits + Suppress("]")
        tag_with_pid = Word(alphas + "/-_.()") + bracketed_pid
        tag_with_colon = Word(alphas + "/-_.") + Word(":")
        app = tag_with_pid | tag_with_colon

        # Message: everything left on the line. In the name[pid] shape the
        # ':' after ']' is not consumed above, so it ends up in the message.
        rest_of_line = Regex(".*")

        self.__pattern = stamp + host + app + rest_of_line

    def parse(self, line):
        """Parse one log line and return its fields as a dict.

        Fields are picked by fixed token position, so the line has to produce
        exactly the token layout described in ``__init__``; a line that does
        not match the grammar makes ``parseString`` raise.

        NOTE(review): "timestamp" is the current local time formatted as
        ``YYYY-MM-DD HH:MM:SS``; the month/day/time tokens parsed from the
        line (positions 0-2) are not used.
        NOTE(review): for the ``name:`` shape the literal ":" token is what
        lands in the "pid" slot.
        """
        tokens = self.__pattern.parseString(line)
        return {
            "timestamp": strftime("%Y-%m-%d %H:%M:%S"),
            "hostname": tokens[3],
            "appname": tokens[4],
            "pid": tokens[5],
            "message": tokens[6],
        }
def main():
    """Parse every line of ./messages.log and return the list of field dicts."""
    log_parser = Parser()
    with open('./messages.log') as log_file:
        # One parsed dict per line, in file order.
        return [log_parser.parse(raw_line) for raw_line in log_file]


if __name__ == "__main__":
    main()
以下是需要解析的各种格式的日志示例:
Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses)
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
Mar 7 21:23:22 avas dccifd[6191]: missing message body
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
Mar 8 16:05:26 avas arpwatch: listening on eth0
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
Mar 8 15:18:40 avas: last message repeated 11 times
请问我该如何处理这些不同格式的日志行?
答案 0 :(得分:0)
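为了处理最后那种没有 appname 的日志行(例如 `avas: last message repeated 11 times`),我使用 pyparsing 的 Optional 类把 appname 部分标记为可选,并把结尾的 ':' 从 appname 中拆出来单独匹配。在下面的代码中,我还做了其他一些调整:添加了在解析时完成数据转换的解析动作(parse action),并为各字段设置了结果名称(results name),从而简化在 parse() 方法中生成 dict 的过程。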
from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex, Optional
from datetime import datetime
class Parser(object):
    """Parse syslog-style lines of the form ``Mon D HH:MM:SS host [app[pid]]: message``.

    The application tag and its bracketed pid are both optional, so all of
    these shapes are accepted::

        Mar 7 04:02:16 avas clamd[11165]: text
        Mar 9 11:58:18 avas kernel: text
        Mar 8 15:18:40 avas: text
    """

    # log lines don't include the year, but if we don't provide one,
    # datetime.strptime will assume 1900
    ASSUMED_YEAR = '2016'

    def __init__(self, assumed_year=None):
        """Build the grammar.

        Args:
            assumed_year: year to attach to parsed timestamps (int or
                four-digit string). When None, the class attribute
                ``ASSUMED_YEAR`` is used, looked up at parse time.
        """
        self._assumed_year = assumed_year

        ints = Word(nums)

        # timestamp: three-letter month (capital first letter), day, HH:MM:SS
        month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day = ints
        hour = Combine(ints + ":" + ints + ":" + ints)
        timestamp = month + day + hour
        # a parse action will convert this timestamp to a datetime
        timestamp.setParseAction(lambda t: self._to_datetime(t))

        # hostname
        hostname = Word(alphas + nums + "_-.")

        # appname, optionally followed by "[pid]"; digits are allowed so that
        # tags such as "dhclient3" are accepted too
        pid = Suppress("[") + ints("pid") + Suppress("]")
        appname = Word(alphas + nums + "/-_.()")("appname") + Optional(pid)
        appname.setName("appname")

        # message: the rest of the line
        message = Regex(".*")

        # pattern build
        # (results names make it easy to access parsed fields; the ':' that
        # separates the header from the message is matched here and dropped)
        self._pattern = (
            timestamp("timestamp")
            + hostname("hostname")
            + Optional(appname)
            + Suppress(':')
            + message("message")
        )

    def _to_datetime(self, tokens):
        """Convert the (month, day, HH:MM:SS) tokens into a datetime in the assumed year."""
        year = self.ASSUMED_YEAR if self._assumed_year is None else self._assumed_year
        return datetime.strptime(
            '{} {}'.format(year, ' '.join(tokens)), '%Y %b %d %H:%M:%S'
        )

    def parse(self, line):
        """Parse one log line into a dict.

        Args:
            line: a single log line.

        Returns:
            dict with the keys ``timestamp`` (datetime), ``hostname``,
            ``appname``, ``pid`` and ``message``. ``appname`` and ``pid`` are
            empty strings when the line does not contain them.

        A line that does not match the grammar makes ``parseString`` raise.
        """
        fields = self._pattern.parseString(line).asDict()
        # fill in keys that might not have been found in the input string
        for key in ('appname', 'pid'):
            fields.setdefault(key, '')
        return fields
使用 runTests() 针对一组测试输入验证解析器:
# Sample lines, one per log shape to check: with pid, without pid,
# and without any application tag at all.
tests = """\
Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses)
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
Mar 7 21:23:22 avas dccifd[6191]: missing message body
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
Mar 8 16:05:26 avas arpwatch: listening on eth0
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
Mar 8 15:18:40 avas: last message repeated 11 times"""

# Take the grammar straight from a Parser instance and let pyparsing run it
# against the sample text, printing the result for each line.
pattern = Parser()._pattern
pattern.runTests(tests)
给出:
Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND
[datetime.datetime(2016, 3, 7, 4, 2, 16), 'avas', 'clamd', '11165', '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND'
- pid: '11165'
- timestamp: datetime.datetime(2016, 3, 7, 4, 2, 16)
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND
[datetime.datetime(2016, 3, 7, 4, 5, 55), 'avas', 'clamd', '11240', '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND'
- pid: '11240'
- timestamp: datetime.datetime(2016, 3, 7, 4, 5, 55)
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
[datetime.datetime(2016, 3, 7, 9, 0, 51), 'avas', 'clamd', '27173', 'SelfCheck: Database status OK.']
- appname: 'clamd'
- hostname: 'avas'
- message: 'SelfCheck: Database status OK.'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 9, 0, 51)
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses)
[datetime.datetime(2016, 3, 7, 5, 59, 2), 'avas', 'clamd', '27173', 'Database correctly reloaded (20400 viruses)']
- appname: 'clamd'
- hostname: 'avas'
- message: 'Database correctly reloaded (20400 viruses)'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 5, 59, 2)
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
[datetime.datetime(2016, 3, 7, 11, 14, 35), 'avas', 'dccd', '13284', '21 requests/sec are too many from anonymous 205.201.1.56,2246']
- appname: 'dccd'
- hostname: 'avas'
- message: '21 requests/sec are too many from anonymous 205.201.1.56,2246'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 7, 11, 14, 35)
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
[datetime.datetime(2016, 3, 8, 0, 22, 57), 'avas', 'dccifd', '9933', 'write(MTA socket,4): Broken pipe']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'write(MTA socket,4): Broken pipe'
- pid: '9933'
- timestamp: datetime.datetime(2016, 3, 8, 0, 22, 57)
Mar 7 21:23:22 avas dccifd[6191]: missing message body
[datetime.datetime(2016, 3, 7, 21, 23, 22), 'avas', 'dccifd', '6191', 'missing message body']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'missing message body'
- pid: '6191'
- timestamp: datetime.datetime(2016, 3, 7, 21, 23, 22)
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
[datetime.datetime(2016, 3, 9, 16, 5, 17), 'avas', 'named', '12045', 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53'
- pid: '12045'
- timestamp: datetime.datetime(2016, 3, 9, 16, 5, 17)
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
[datetime.datetime(2016, 3, 10, 0, 38, 16), 'avas', 'dccifd', '23069', 'continue not asking DCC 17 seconds after failure']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'continue not asking DCC 17 seconds after failure'
- pid: '23069'
- timestamp: datetime.datetime(2016, 3, 10, 0, 38, 16)
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
[datetime.datetime(2016, 3, 10, 9, 42, 11), 'avas', 'named', 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT'
- timestamp: datetime.datetime(2016, 3, 10, 9, 42, 11)
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
[datetime.datetime(2016, 3, 9, 3, 48, 7), 'avas', 'dccd', '145', 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`']
- appname: 'dccd'
- hostname: 'avas'
- message: 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`'
- pid: '145'
- timestamp: datetime.datetime(2016, 3, 9, 3, 48, 7)
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
[datetime.datetime(2016, 3, 9, 11, 58, 18), 'avas', 'kernel', 'i810_audio: Connection 0 with codec id 2']
- appname: 'kernel'
- hostname: 'avas'
- message: 'i810_audio: Connection 0 with codec id 2'
- timestamp: datetime.datetime(2016, 3, 9, 11, 58, 18)
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
[datetime.datetime(2016, 3, 9, 19, 41, 13), 'avas', 'dccd', '3004', '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577']
- appname: 'dccd'
- hostname: 'avas'
- message: '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577'
- pid: '3004'
- timestamp: datetime.datetime(2016, 3, 9, 19, 41, 13)
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
[datetime.datetime(2016, 3, 8, 9, 1, 7), 'avas', 'sshd(pam_unix)', '21839', 'session opened for user tom by (uid=35567)']
- appname: 'sshd(pam_unix)'
- hostname: 'avas'
- message: 'session opened for user tom by (uid=35567)'
- pid: '21839'
- timestamp: datetime.datetime(2016, 3, 8, 9, 1, 7)
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
[datetime.datetime(2016, 3, 8, 3, 52, 4), 'avas', 'dccd', '13284', '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window']
- appname: 'dccd'
- hostname: 'avas'
- message: '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 8, 3, 52, 4)
Mar 8 16:05:26 avas arpwatch: listening on eth0
[datetime.datetime(2016, 3, 8, 16, 5, 26), 'avas', 'arpwatch', 'listening on eth0']
- appname: 'arpwatch'
- hostname: 'avas'
- message: 'listening on eth0'
- timestamp: datetime.datetime(2016, 3, 8, 16, 5, 26)
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
[datetime.datetime(2016, 3, 10, 10, 0, 6), 'avas', 'named', '6986', 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 6)
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
[datetime.datetime(2016, 3, 10, 10, 0, 10), 'avas', 'named', '6986', 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 10)
Mar 8 15:18:40 avas: last message repeated 11 times
[datetime.datetime(2016, 3, 8, 15, 18, 40), 'avas', 'last message repeated 11 times']
- hostname: 'avas'
- message: 'last message repeated 11 times'
- timestamp: datetime.datetime(2016, 3, 8, 15, 18, 40)
或者使用Parser类的parse()方法:
from pprint import pprint
# Build the grammar once and reuse it for every line; constructing a new
# Parser per iteration would repeat the same grammar setup each time.
parser = Parser()
for t in tests.splitlines():
    pprint(parser.parse(t))
    print()
给出:
{'appname': 'clamd',
'hostname': 'avas',
'message': '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: '
'Worm.Mydoom.F FOUND ',
'pid': '11165',
'timestamp': datetime.datetime(2016, 3, 7, 4, 2, 16)}
{'appname': 'clamd',
'hostname': 'avas',
'message': '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: '
'Worm.SomeFool.Gen-1 FOUND ',
'pid': '11240',
'timestamp': datetime.datetime(2016, 3, 7, 4, 5, 55)}
{'appname': 'clamd',
'hostname': 'avas',
'message': 'SelfCheck: Database status OK.',
'pid': '27173',
'timestamp': datetime.datetime(2016, 3, 7, 9, 0, 51)}
{'appname': 'clamd',
'hostname': 'avas',
'message': 'Database correctly reloaded (20400 viruses) ',
'pid': '27173',
'timestamp': datetime.datetime(2016, 3, 7, 5, 59, 2)}
{'appname': 'dccd',
'hostname': 'avas',
'message': '21 requests/sec are too many from anonymous 205.201.1.56,2246',
'pid': '13284',
'timestamp': datetime.datetime(2016, 3, 7, 11, 14, 35)}
{'appname': 'dccifd',
'hostname': 'avas',
'message': 'write(MTA socket,4): Broken pipe',
'pid': '9933',
'timestamp': datetime.datetime(2016, 3, 8, 0, 22, 57)}
{'appname': 'dccifd',
'hostname': 'avas',
'message': 'missing message body',
'pid': '6191',
'timestamp': datetime.datetime(2016, 3, 7, 21, 23, 22)}
{'appname': 'named',
'hostname': 'avas',
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
'10.0.0.253#53',
'pid': '12045',
'timestamp': datetime.datetime(2016, 3, 9, 16, 5, 17)}
{'appname': 'dccifd',
'hostname': 'avas',
'message': 'continue not asking DCC 17 seconds after failure',
'pid': '23069',
'timestamp': datetime.datetime(2016, 3, 10, 0, 38, 16)}
{'appname': 'named',
'hostname': 'avas',
'message': 'client 127.0.0.1#55524: query: '
'23.68.27.142.sa-trusted.bondedsender.org IN TXT',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 10, 9, 42, 11)}
{'appname': 'dccd',
'hostname': 'avas',
'message': 'automatic dbclean; starting `dbclean -DPq -i 1189 -L '
'info,local5.notice -L error,local5.err`',
'pid': '145',
'timestamp': datetime.datetime(2016, 3, 9, 3, 48, 7)}
{'appname': 'kernel',
'hostname': 'avas',
'message': 'i810_audio: Connection 0 with codec id 2',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 9, 11, 58, 18)}
{'appname': 'dccd',
'hostname': 'avas',
'message': '"packet length 44 too small for REPORT" sent to client 1 at '
'194.63.250.215,47577',
'pid': '3004',
'timestamp': datetime.datetime(2016, 3, 9, 19, 41, 13)}
{'appname': 'sshd(pam_unix)',
'hostname': 'avas',
'message': 'session opened for user tom by (uid=35567)',
'pid': '21839',
'timestamp': datetime.datetime(2016, 3, 8, 9, 1, 7)}
{'appname': 'dccd',
'hostname': 'avas',
'message': '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window',
'pid': '13284',
'timestamp': datetime.datetime(2016, 3, 8, 3, 52, 4)}
{'appname': 'arpwatch',
'hostname': 'avas',
'message': 'listening on eth0',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 8, 16, 5, 26)}
{'appname': 'named',
'hostname': 'avas',
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
'192.75.26.21#53',
'pid': '6986',
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 6)}
{'appname': 'named',
'hostname': 'avas',
'message': 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX',
'pid': '6986',
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 10)}
{'appname': '',
'hostname': 'avas',
'message': 'last message repeated 11 times',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 8, 15, 18, 40)}