我正在尝试使用以下格式从文本文件中获取一些数据:
jvm: 2011-08-29 17:09:54.438864:
MemoryStatistics: [290328680, 381288448]
moniData: 2011-08-29 17:09:54.438864:
Depth: [0]
RecordsSent: [1]
rdoutData: 2011-08-29 17:09:54.438864:
Depth: [0]
RecordsSent: [0]
rdoutReq: 2011-08-29 17:09:54.438864:
TotalRecordsReceived: 132
RecordsReceived: [132]
BytesReceived: [8184]
sender: 2011-08-29 17:09:54.438864:
NumReadoutRequestsReceived: 178
NumHitsReceived: 2663
NumReadoutsSent: 1
NumHitsCached: 0
NumHitsQueued: 310
NumReadoutRequestsQueued: 0
snData: 2011-08-29 17:09:54.438864:
Depth: [0]
RecordsSent: [61]
stringHit: 2011-08-29 17:09:54.438864:
Depth: [8]
RecordsSent: [3026]
stringhub: 2011-08-29 17:09:54.438864:
TimeOfLastHitOutputFromHKN1: 207977962295545677
NumberOfActiveAndTotalChannels: [60, 60]
NumberOfActiveChannels: 60
TimeOfLastHitInputToHKN1: 207977964479700660
HitRateLC: 0.0
HitRate: 0.0
TotalLBMOverflows: 1091
system: 2011-08-29 17:09:54.438864:
LoadAverage: [0.0, 0.02, 0.35999999999999999]
NetworkIO: {'lo_tx_errs': 0, 'eth1_rx_fifo': 0, 'eth2_rx_packets': 0, 'eth1_tx_compressed': 0, 'eth2_tx_compressed': 0, 'eth0_tx_fifo': 0, 'eth1_tx_packets': 0, 'lo_tx_compressed': 0, 'eth1_rx_compressed': 0, 'lo_rx_errs': 0, 'eth1_tx_fifo': 0, 'lo_tx_fifo': 0, 'eth0_tx_errs': 0, 'eth0_rx_multicast': 0, 'eth0_tx_carrier': 0, 'eth3_rx_compressed': 0, 'eth3_tx_drop': 0, 'lo_tx_drop': 0, 'eth2_rx_drop': 0, 'eth1_tx_drop': 0, 'eth3_rx_bytes': 0, 'eth3_tx_packets': 0, 'lo_rx_bytes': 8270472, 'eth2_rx_errs': 0, 'eth3_tx_errs': 0, 'eth0_rx_errs': 0, 'eth2_tx_errs': 0, 'lo_rx_packets': 71359, 'eth2_rx_compressed': 0, 'eth3_rx_packets': 0, 'eth0_tx_drop': 0, 'eth0_rx_frame': 0, 'eth1_tx_bytes': 0, 'eth1_rx_multicast': 0, 'eth1_rx_packets': 0, 'eth2_tx_fifo': 0, 'eth1_tx_errs': 0, 'eth2_tx_bytes': 0, 'eth3_rx_frame': 0, 'eth2_rx_frame': 0, 'eth1_rx_bytes': 0, 'eth0_rx_drop': 0, 'eth3_rx_drop': 0, 'eth1_rx_frame': 0, 'eth2_tx_packets': 0, 'eth0_tx_bytes': 389183382674, 'eth3_rx_errs': 0, 'eth0_rx_bytes': 141781372747, 'eth3_tx_compressed': 0, 'eth2_rx_fifo': 0, 'lo_tx_bytes': 8270472, 'eth1_rx_errs': 0, 'eth1_tx_carrier': 0, 'eth0_rx_packets': 478007025, 'lo_rx_drop': 0, 'eth0_tx_compressed': 0, 'eth0_rx_fifo': 0, 'eth3_tx_colls': 0, 'eth0_tx_colls': 0, 'lo_tx_packets': 71359, 'eth2_rx_multicast': 0, 'eth2_tx_colls': 0, 'eth3_tx_fifo': 0, 'eth1_tx_colls': 0, 'lo_tx_carrier': 0, 'lo_rx_frame': 0, 'eth1_rx_drop': 0, 'lo_tx_colls': 0, 'eth3_tx_bytes': 0, 'lo_rx_fifo': 0, 'eth2_tx_drop': 0, 'eth3_tx_carrier': 0, 'eth3_rx_multicast': 0, 'eth0_rx_compressed': 0, 'eth2_rx_bytes': 0, 'eth2_tx_carrier': 0, 'eth0_tx_packets': 1197286889, 'lo_rx_multicast': 0, 'lo_rx_compressed': 0, 'eth3_rx_fifo': 0}
AvailableDiskSpace: {'/': 43836096, '/dev/shm': 24725760}
tcalData: 2011-08-29 17:09:54.438864:
Depth: [0]
RecordsSent: [0]
PyrateBufferManager: 2011-08-29 17:09:57.031479:
CurrentAquiredBuffers: 0
ReturnBufferCount: 4285
CurrentAquiredBytes: 0
我想绘制文件中存储的各个量随时间变化的图,所以基本上我想得到类似下面这种格式的数组:
timestamp=[...,17:09:54.438864,...]
snDataDepth=[..,0,...]
snDataRecordsSend=[..., 61,...]
有人建议我使用类似下面的代码:
f = open(file, "r").readlines()
# Collect the text found between the first "[" and the next "]" of each line.
dummy = []
for line in f:
    # Record headers ("snData: 2011-...:") and plain scalar lines contain no
    # "[", so split("[")[1] would raise IndexError on them; skip those lines.
    if "[" in line:
        dummy.append(line.split("[")[1].split("]")[0])
来提取数字。但我无法按文本中的类别(见上文)和时间戳正确地整理数据。
提前感谢您提供任何帮助
应要求补充:
我已经用下面的脚本获取了其他数据:
#!/usr/bin/env python
import sys, os, re
import numpy as np
import pylab as py
def main():
    """Collect snRate values and PyrateBufferManager counters from input files.

    Every command-line argument is treated as a file path.  Files whose
    extension contains ``log`` are scanned for ``snRate: <number>`` entries.
    Files whose extension contains ``moni`` are scanned for time-of-day
    stamps and for the three counters listed on the lines that follow each
    ``PyrateBufferManager`` record header.  The base name, the extension and
    the collected counters are printed to standard output; nothing is
    returned.
    """
    snrate = []
    PyrateBufferManagerCABu = []
    PyrateBufferManagerRBC = []
    PyrateBufferManagerCABy = []
    for path in sys.argv[1:]:
        base, ext = os.path.splitext(path)
        print(base)
        if 'log' in ext:
            rates = []
            # The context manager closes the file even if parsing fails.
            with open(path, 'r') as f:
                for line in f:
                    rates += re.findall(r'snRate: (\d.?\d+)', line)
            snrate.append(rates)
            print(ext)
        if 'moni' in ext:
            with open(path, 'r') as f:
                lines = f.readlines()
            timestamp = []
            for index, line in enumerate(lines):
                # Full time of day including all fractional digits,
                # e.g. "17:09:54.438864".
                timestamp += re.findall(r'\d\d:\d\d:\d\d\.\d+', line)
                if 'PyrateBufferManager' in line:
                    # The counters are on the lines after the header, so look
                    # at the next three lines of the file rather than at
                    # characters of the header line itself.
                    record = ''.join(lines[index + 1:index + 4])
                    PyrateBufferManagerCABu += [
                        int(value) for value in
                        re.findall(r'CurrentAquiredBuffers:\s*(\d+)', record)]
                    print(PyrateBufferManagerCABu)
                    PyrateBufferManagerRBC += [
                        int(value) for value in
                        re.findall(r'ReturnBufferCount:\s*(\d+)', record)]
                    PyrateBufferManagerCABy += [
                        int(value) for value in
                        re.findall(r'CurrentAquiredBytes:\s*(\d+)', record)]
            # Unique, chronologically sorted stamps; not used further in this
            # function yet (intended for the plotting step).
            timestamp = sorted(set(timestamp))
            # Single-argument print calls behave the same on Python 2 and 3.
            print('%s %s' % (PyrateBufferManagerCABu, PyrateBufferManagerRBC))
            print(ext)
答案 0 :(得分:1)
您可以使用 Python 内置的正则表达式库 re。要获得所有时间戳,您可以执行以下操作:
import re
import sys


def main(path=None):
    """Print and return every record timestamp found in a monitoring file.

    Args:
        path: File to read.  When omitted, the first command-line argument
            is used (IndexError is raised if there is none).

    Returns:
        A list of strings such as ``'2011-08-29 17:09:54.438864'``, in file
        order, without the colon that terminates each record header.
    """
    if path is None:
        path = sys.argv[1]
    # Use a distinct name for the handle: assigning to the same name that is
    # passed to open() makes it an unbound local.  The context manager also
    # closes the file.
    with open(path) as f:
        text = f.read()
    timestamp = re.findall(
        r'(\d{4}-\d\d-\d\d\s\d\d:\d\d:\d\d(?:\.\d+)?):', text)
    print(timestamp)
    return timestamp


if __name__ == '__main__':
    main()
答案 1 :(得分:1)
您可以使用 lepl 解析输入数据:
#!/usr/bin/env python
import ast
import fileinput
import logging
from datetime import datetime
from pprint import pprint
from string import ascii_letters, digits
from lepl import Any, Iterate, Newline, Regexp, SkipTo, Space
# Grammar for a single "name: value" property line.
# ABNF: property = name colon python_literal
# NOTE(review): in lepl `~` is expected to discard the matched text and
# `>` / `>>` to apply a function to the results -- confirm against the
# lepl manual.
name = Any(ascii_letters+digits)[1:,...] # \w+
colon = Space()[:,...] & ':' & Space()[:,...] # \s*:\s*
# The remainder of the line is evaluated with ast.literal_eval, so values
# become Python numbers, lists or dicts.
python_literal = Regexp(r'.+') >> ast.literal_eval
property_ = name & ~colon & python_literal > tuple
# record consists of name, timestamp and one or more properties
# ABNF: record = name colon timestamp colon 1*( NEWLINE indent property )
# The timestamp text is converted to a datetime with the format below.
timestamp = Regexp(r'.*[^\s:]') >> (lambda s: (
datetime.strptime(s, "%Y-%m-%d %H:%M:%S.%f")))
# Each property must be preceded by a newline and at least one space; the
# properties of a record are collected with dict, the whole record with tuple.
record = (name & ~colon & timestamp & ~colon &
((~Newline() & ~Space()[1:,...] & property_)[1:] > dict)) > tuple
# file consists of one or more records interlaced with newlines
# ABNF: file = 1*( NEWLINE | record )
# skip unrecognized text up to new line
# Unrecognized text is reported through logging.error.
unknown = SkipTo(Newline()) > (lambda s: logging.error('unknown: %r' % (s,)))
it = Iterate(record | ~Newline() | ~unknown) # consume input one record at a time
it.config.no_full_first_match().no_memoize() # improve performance
iterparse = it.get_parse_file_all() # output one record at a time
# Parse the files named on the command line (or stdin) via fileinput and
# pretty-print the first element of every non-empty result.
pprint([lst[0] for lst in iterparse(fileinput.input()) if lst])
[('jvm',
datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
{'MemoryStatistics': [290328680, 381288448]}),
('moniData',
datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
{'Depth': [0], 'RecordsSent': [1]}),
('rdoutData',
datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
{'Depth': [0], 'RecordsSent': [0]}),
('rdoutReq',
datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
{'BytesReceived': [8184],
'RecordsReceived': [132],
'TotalRecordsReceived': 132}),
('sender',
datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
{'NumHitsCached': 0,
'NumHitsQueued': 310,
'NumHitsReceived': 2663,
'NumReadoutRequestsQueued': 0,
'NumReadoutRequestsReceived': 178,
'NumReadoutsSent': 1}),
('snData',
datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
{'Depth': [0], 'RecordsSent': [61]}),
# ... snip ...
('PyrateBufferManager',
datetime.datetime(2011, 8, 29, 17, 9, 57, 31479),
{'CurrentAquiredBuffers': 0,
'CurrentAquiredBytes': 0,
'ReturnBufferCount': 4285})]
答案 2 :(得分:1)
为什么不用更简单的方法呢?
import re

# Non-greedy ".+?" keeps each match inside its own snData record.  With a
# greedy ".+" and DOTALL the pattern runs on to the LAST "Depth" /
# "RecordsSent" lines of the file and reports another record's values.
regx = re.compile(r'snData: +(\d{4}-\d\d-\d\d +\d\d:\d\d:\d\d\.\d+).+?'
                  r'Depth: +\[(\d+)\].+?'
                  r'RecordsSent: +\[(\d+)\]', re.DOTALL)


def extract_sndata(text):
    """Return the snData series found in *text*.

    Args:
        text: Full contents of a monitoring file.

    Returns:
        A tuple ``(timestamp, snDataDepth, snDataRecordsSend)`` of three
        parallel lists of strings, one entry per snData record, in file
        order.  All three lists are empty when no record is present.
    """
    timestamp, snDataDepth, snDataRecordsSend = [], [], []
    # finditer yields every record, not just the first, and simply yields
    # nothing when there is no match.
    for match in regx.finditer(text):
        a, b, c = match.groups()
        timestamp.append(a)
        snDataDepth.append(b)
        snDataRecordsSend.append(c)
    return timestamp, snDataDepth, snDataRecordsSend


if __name__ == '__main__':
    with open('data_for_plot.txt') as f:
        ch = f.read()
    timestamp, snDataDepth, snDataRecordsSend = extract_sndata(ch)
    # For the sample in the question this prints the snData values:
    # ['2011-08-29 17:09:54.438864'], ['0'] and ['61'].
    print(timestamp)
    print(snDataDepth)
    print(snDataRecordsSend)
结果
['2011-08-29 17:09:54.438864']
['0']
['0']
答案 3 :(得分:0)
如果对文件内容稍作变换,就可以使用 yaml 来解析它:
#!/usr/bin/env python
import datetime
import re
import yaml

# Read the whole file; the context manager closes the handle afterwards.
with open('input') as f:
    text = f.read()
# transform text to make it a valid yaml: move everything after a top-level
# "name:" onto its own line so the timestamp becomes a nested mapping key
re_name = re.compile(r'^(\w+\:)\s', re.MULTILINE)
yaml_text = re_name.sub(r'\1\n ', text)
# safe_load builds only plain data types (including timestamps) and does not
# need an explicit Loader argument, which newer PyYAML requires for load().
obj = yaml.safe_load(yaml_text)
d = obj['sender'][datetime.datetime(2011, 8, 29, 17, 9, 54, 438864)]
print('number of hits: {NumHitsQueued}'.format(**d))
number of hits: 310