最近我正在处理使用python进行nginx分析工作的访问日志。
根据 this 这个链接,我找到了使用 shlex 按空格分割带引号字符串的方法,
但它真的很慢,解析2000行日志耗时超过1.2秒,而我的nginx服务器每秒会生成超过2500行日志。
所以我尝试使用 re,或者更原生(也更粗糙)的方式——直接使用字符串索引。
代码在虚拟机中运行,处理2000行日志大约耗时0.5秒。
我还有其他选择可以提高效率吗?
提前致谢
这是我的代码
import re
import time
import datetime
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080 [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
# Patterns hoisted out of convert() so they are compiled once per process
# instead of once per call -- re.compile inside the hot path was a large
# share of the per-line cost the question complains about.
_QUOTE_SPLIT_RE = re.compile(r'"')
_SPACE_SPLIT_RE = re.compile(r'\s+')
_METHOD_RE = re.compile(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP\/1\.[10]$")
_CODE_BYTE_RE = re.compile(r"^\s+(\d{1,3})\s+(\d+)")
_PORT_SUFFIX_RE = re.compile(r":\d+$")
_URI_RE = re.compile(r"^[A-Z]+\s+?(.*)HTTP\/1\.[10]$")

def convert(line):
    """Parse one nginx access-log line into a dict of fields.

    line: a raw log line in the custom format shown above (request,
    referer, user-agent etc. are double-quote delimited).
    Returns a dict of timing, client, request and response fields.
    Raises IndexError/ValueError on lines that do not fit the format.
    """
    parts = _QUOTE_SPLIT_RE.split(line)   # split on double quotes
    pre = _SPACE_SPLIT_RE.split(parts[0]) # unquoted leading fields
    # HTTP method, e.g. 'GET'; empty string when the request line is odd.
    methods = _METHOD_RE.findall(parts[1])
    http_method = methods[0] if methods else ''
    code_byte = _CODE_BYTE_RE.findall(parts[2])
    status = int(code_byte[0][0])
    bytes_sent = int(code_byte[0][1])
    # upstream address with the trailing ':port' removed
    upstream_addr = _PORT_SUFFIX_RE.sub("", pre[4])
    # times are logged in seconds; stored as integer milliseconds
    request_time = int(float(pre[0]) * 1000)
    if pre[1] == '-':
        upstream_response_time = -1  # '-' means no upstream involved
    else:
        upstream_response_time = int(float(pre[1]) * 1000)
    remote_addr = pre[2]
    host = pre[7].replace(' ', '')
    logdatetime = pre[5].replace('[', '')
    dt = datetime.datetime.strptime(logdatetime, "%d/%b/%Y:%H:%M:%S")
    # str(dt) is 'YYYY-MM-DD HH:MM:SS'; slice the pieces out of it once
    dt_str = str(dt)
    year = int(dt_str[0:4])
    monthday = int(dt_str[4:10].replace("-", ""))  # e.g. 408 for Apr 8
    hour = int(dt_str[11:13])
    logtime = int(dt_str[14:16])  # minutes, stored under the key "time"
    # NOTE(review): mktime interprets dt in the machine's local timezone,
    # ignoring the +0800 offset in the log -- confirm that is intended.
    sec = time.mktime(dt.timetuple())
    # NOTE(review): findall returns a list, so "request_uri" is a
    # one-element list (not a string) -- kept for compatibility.
    request_uri = _URI_RE.findall(parts[1])
    http_referer = parts[3]
    user_agent = parts[5]
    gzip_ratio = parts[7]
    http_x_forwarded_for = parts[9]
    # last quoted field is 'server_addr guid'
    server_addr, _, guid = parts[11].partition(' ')
    doc = {
        "hour": hour,
        "year": year,
        "date": monthday,
        "time": logtime,
        "sec": sec,
        "request_time": request_time,
        "upstream_response_time": upstream_response_time,
        "remote_addr": remote_addr,
        "upstream_addr": upstream_addr,
        "host": host,
        "method": http_method,
        "request_uri": request_uri,
        "status": status,
        "bytes_sent": bytes_sent,
        "http_referer": http_referer,
        "user_agent": user_agent,
        "gzip_ratio": gzip_ratio,
        "http_x_forwarded_for": http_x_forwarded_for,
        "server_addr": server_addr,
        "guid": guid
    }
    return doc
# Benchmark: run convert() 12000 times, printing the elapsed seconds for
# every batch of 2000 lines (the unit the question measures against).
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        # parenthesized so it runs under both Python 2 and Python 3
        # (the original `print str(...)` is a SyntaxError on Python 3)
        print(str(t2 - t1))
以及索引方式:
import time
import datetime
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080 [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
def pair(l):
    """Yield consecutive pairs (l[0], l[1]), (l[2], l[3]), ... from l."""
    for i in range(0, len(l), 2):
        yield (l[i], l[i + 1])

def convert(line):
    """Split a log line on spaces, ignoring spaces inside double quotes.

    Returns the list of fields after the first space, or None when the
    line contains no quotes or an odd number of them.
    NOTE(review): the field before the first space (request_time in the
    sample) is dropped by the slicing below -- looks unintentional, but
    kept for compatibility with the original output.
    """
    # Collapse double spaces so consecutive separators do not produce
    # empty fields.  (The original read line.replace(" ", ""), which
    # stripped EVERY space: allindices(line, " ") then found nothing and
    # the function always returned [] -- clearly a typo.)
    line = line.replace("  ", " ")
    quotes_positions = allindices(line, "\"")
    if not quotes_positions or len(quotes_positions) % 2 != 0:
        return None
    space_positions = allindices(line, " ")
    # Keep only the spaces that fall outside every quoted span.
    target_positions = []
    for s in space_positions:
        inside_quotes = False
        for qs, qe in pair(quotes_positions):
            if qs < s < qe:
                inside_quotes = True
                break
        if not inside_quotes:
            target_positions.append(s)
    # Slice the line between consecutive separator positions.
    ret = []
    last = len(target_positions) - 1
    for i in range(len(target_positions)):
        if i == last:
            ret.append(line[target_positions[i] + 1:])
        else:
            ret.append(line[target_positions[i] + 1:target_positions[i + 1]])
    return ret

def allindices(string, sub):
    """Return the indices of every (possibly overlapping) match of sub."""
    listindex = []
    i = string.find(sub)
    while i >= 0:
        listindex.append(i)
        i = string.find(sub, i + 1)
    return listindex
# Benchmark for the index-based convert(): 12000 calls, printing elapsed
# seconds per batch of 2000 lines.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        # parenthesized so it runs under both Python 2 and Python 3
        # (the original `print str(...)` is a SyntaxError on Python 3)
        print(str(t2 - t1))
答案 0 :(得分:3)
这看起来有点像CSV;我想知道是否可以借用csv模块来处理它?
>>> for row in csv.reader([line], delimiter=' '):
... print repr(row)
...
['0.278', '0.264', '113.116.52.174', '-', '10.10.3.41:20080', '', '[08/Apr/2012:23:59:08', '+0800]', 'shenzhen.anjuke.com', 'GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0', '200', '10914', 'http://shenzhen.anjuke.com/prop/view/104178677', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)', '-', '-', '-', '114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E']
答案 1 :(得分:3)
刚刚根据示例行写了一个正则表达式,我实际上并不知道某些字段的含义所以我使用了占位符名称,你可以将它们重命名为更有意义的。在我的机器上,这个片段比你的第一个快4到5倍。
# One pre-compiled regex describing the whole log line; re.VERBOSE lets it
# be laid out one token per line.  Group names like float1/field1 are
# placeholders -- the answer's author did not know those fields' meanings.
log_line_re = re.compile(
    r"""
    (?P<float1>[0-9.]+)
    \s
    (?P<float2>[0-9.]+)
    \s
    (?P<ip1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
    \s
    (?P<field1>.+?)
    \s
    (?P<ip_port_1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})
    \s+
    \[(?P<request_date>.+?)\]
    \s
    (?P<host>.+?)
    \s
    "
    (?P<http_method>[A-Z]+)
    \s
    (?P<request_path>.+?)
    \s
    HTTP/(?P<http_version>[0-9.]+)
    "
    \s
    (?P<status_code>\d{3})
    \s
    (?P<number>\d+)
    \s
    "
    (?P<referer>.+?)
    "
    \s
    "(?P<user_agent>.+?)"
    \s
    "(?P<field2>.+?)"
    \s
    "(?P<field3>.+?)"
    \s
    (?P<field4>.+?)
    "
    (?P<ip2>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
    \s
    (?P<request_guid>.+?)
    "
    """, re.VERBOSE)

def convert(line):
    """Parse a log line with the single pre-compiled regex above.

    Returns the dict of named groups, or None when the line does not
    match.  (The original called .groupdict() unconditionally, so any
    non-matching line raised AttributeError on the None match object;
    returning None matches the behavior of the question's index-based
    convert() for malformed input.)
    """
    m = log_line_re.match(line)
    return m.groupdict() if m else None