我试图用 Perl 找出拆分 CLF 格式行(例如 Apache access.log 文件中使用的那种格式)的最快方法。多年下来,这样的日志行已经累积到数百万条。以下是我到目前为止测试过的内容。我最后一次尝试已经比使用正则表达式更快。
但是 - 你觉得怎么样 - 有没有办法更快地做到这一点?
1 2 3 4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
1: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
1.2.3.4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
2: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
202 200 1.2.3.4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0
3: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
1.2.3.4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
4: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
GET / ..?,-" HTTP/1.0 13/Jun/2007:03:20:15 +0200 1.2.3.4 - - 200 202
5: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 1.2.3.4 - - 200 202
6: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
---- hit <ENTER> to start Test ----
Benchmark: timing 100000 iterations of Method 1, Method 2, Method 3,
Method 4, Method 5, Method 6...
1: 39 wallclock s(37.64usr + 0.12sys = 37.77CPU) @2647.81/s(n=100000)
2: 39 wallclock s(38.35usr + 0.19sys = 38.53CPU) @2595.18/s(n=100000)
3: 39 wallclock s(37.19usr + 0.14sys = 37.33CPU) @2678.74/s(n=100000)
4: 38 wallclock s(36.80usr + 0.08sys = 36.88CPU) @2711.57/s(n=100000)
5: 38 wallclock s(36.93usr + 0.14sys = 37.07CPU) @2697.89/s(n=100000)
6: 38 wallclock s(36.11usr + 0.16sys = 36.27CPU) @2757.10/s(n=100000)
8X ----------------
#!/usr/bin/perl -w
# Benchmark script: six alternative subs (split1 .. split6) each split the
# same Common Log Format line and derive $host, $timestamp and $request.
use strict;
use warnings;
use FileHandle;
use Date::Parse;
use Benchmark;
STDOUT->autoflush(1); #....................................... autoflush STDOUT
# Sample input line read by every splitN sub.
our $s='1.2.3.4 - - [13/Jun/2007:03:20:15 +0200] "GET / ..?,-" HTTP/1.0" 200 202';
# Package globals written by every splitN sub: the raw fields (@T) and the
# three derived values.
our (@T,$host,$timestamp,$request);
# Functional check: run each variant once and print the fields it produced,
# followed by the derived host / timestamp / request.
print "---- test functionality -----------------------------------\n";
split1(); print join(" ",@T)."\n1: [$host] [$timestamp] [$request]\n";
split2(); print join(" ",@T)."\n2: [$host] [$timestamp] [$request]\n";
split3(); print join(" ",@T)."\n3: [$host] [$timestamp] [$request]\n";
split4(); print join(" ",@T)."\n4: [$host] [$timestamp] [$request]\n";
split5(); print join(" ",@T)."\n5: [$host] [$timestamp] [$request]\n";
split6(); print join(" ",@T)."\n6: [$host] [$timestamp] [$request]\n";
# Read (and discard) one line of input before the timing run starts.
print "---- hit <ENTER> to start Test ----"; <>;
# Time 100000 iterations of each variant; the code to run is passed to
# Benchmark as strings ('&split1' ...), keyed by the label shown in the report.
timethese (
100000,
{'1' => '&split1',
'2' => '&split2',
'3' => '&split3',
'4' => '&split4',
'5' => '&split5',
'6' => '&split6',
}
);
exit(0);
1; # never reached: exit(0) above ends the program first
# split1 - split the sample line in $s with one anchored regex.
#
# Reads:  $s
# Writes: @T (the twelve regex captures: four IP octets, two fields after the
#         host, bracketed timestamp, three request parts, last two fields),
#         $host (octets packed into a 32-bit big-endian integer),
#         $timestamp (result of str2time on the bracketed timestamp),
#         $request (the three request parts joined with single spaces).
sub split1
{ ($host,$timestamp,$request)=('','','');
  #--------------------------------------------------------------------------
  # A failed match yields an empty list, so @T is emptied in that case.
  @T = $s =~ m/^(\d+)\.(\d+)\.(\d+)\.(\d+) (\S+) (\S+) \[(.+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$/;
  #--------------------------------------------------------------------------
  my ($stamp,@parts)=@T[6..9];
  # pack "C4" consumes only the first four list elements (the octets).
  $host=unpack("N",pack("C4",@T));
  $timestamp=str2time($stamp);
  $request=join(" ",@parts);
}
# split2 - split the sample line in $s on single spaces, then glue the
# pieces of the request and of the timestamp back together.
#
# Reads:  $s
# Writes: @T (host, two fields after the host, timestamp, request, and the
#         last two fields), $host, $timestamp, $request.
sub split2
{ ($host,$timestamp,$request)=('','','');
  #--------------------------------------------------------------------------
  @T=split(/ /,$s);
  # Tokens 5 .. (count-3) belong to the quoted request; merge them into one.
  my $last_req=@T-3;
  my $quoted=join(" ",@T[5..$last_req]);
  splice(@T,5,$last_req-4,$quoted);
  # Tokens 3 and 4 are the two halves of the bracketed timestamp.
  my $bracketed=join(" ",@T[3,4]);
  splice(@T,3,2,$bracketed);
  chomp($T[6]);
  # Drop the surrounding [ ] and " " characters.
  $_=substr($_,1,-1) for @T[3,4];
  #--------------------------------------------------------------------------
  my @octets=split(/\./,$T[0]);
  $host=unpack("N",pack("C4",@octets));
  $timestamp=str2time($T[3]);
  $request=$T[4];
}
# split3 - split the sample line in $s with index/rindex/substr, pushing each
# field onto @T as it is cut off the working copy $x.
#
# Reads:  $s
# Writes: @T in push order (last field, second-to-last field, host, the two
#         fields after the host, timestamp, request), $host, $timestamp,
#         $request.
sub split3
{ $host='';$timestamp='';$request='';@T=();
#----------------------------------------------------------------------------
my $i; my $x=$s;
# Cut the last two space-separated fields off the right-hand end.
$i=rindex($x,' ');push(@T,substr($x,$i+1)); $x=substr($x,0,$i);
$i=rindex($x,' ');push(@T,substr($x,$i+1)); $x=substr($x,0,$i);
# Cut the host off the left; the -1 also drops the final character of $x
# (the closing quote of the request for the sample line).
$i=index($x,' ');push(@T,substr($x,0,$i)); $x=substr($x,$i+1,-1);
$i=index($x,' ');push(@T,substr($x,0,$i)); $x=substr($x,$i+1);
# $i+2 skips the space and the opening '[' of the timestamp.
$i=index($x,' ');push(@T,substr($x,0,$i)); $x=substr($x,$i+2);
# Timestamp runs up to ']'; $i+3 skips '] "' so the rest is the request.
$i=index($x,']');push(@T,substr($x,0,$i)); push(@T,substr($x,$i+3));
#----------------------------------------------------------------------------
# Dotted host -> four octets -> 32-bit big-endian integer.
$host=unpack("N",pack("C4",split(/\./,$T[2])));
$timestamp=str2time($T[5]);
$request=$T[6];
}
# split4 - same index/rindex/substr walk as split3, but each field is stored
# at a fixed index of @T instead of being pushed.
#
# Reads:  $s
# Writes: @T ([0] host, [1] and [2] the two fields after the host,
#         [3] timestamp, [4] request, [5] second-to-last field,
#         [6] last field), $host, $timestamp, $request.
sub split4
{ $host='';$timestamp='';$request='';@T=();
#----------------------------------------------------------------------------
my $i; my $x=$s;
# Cut the last two space-separated fields off the right-hand end.
$i=rindex($x,' ');$T[6]=substr($x,$i+1); $x=substr($x,0,$i);
$i=rindex($x,' ');$T[5]=substr($x,$i+1); $x=substr($x,0,$i);
# Cut the host off the left; the -1 also drops the final character of $x
# (the closing quote of the request for the sample line).
$i= index($x,' ');$T[0]=substr($x,0,$i); $x=substr($x,$i+1,-1);
$i= index($x,' ');$T[1]=substr($x,0,$i); $x=substr($x,$i+1);
# $i+2 skips the space and the opening '[' of the timestamp.
$i= index($x,' ');$T[2]=substr($x,0,$i); $x=substr($x,$i+2);
# Timestamp runs up to ']'; $i+3 skips '] "' so the rest is the request.
$i= index($x,']');$T[3]=substr($x,0,$i); $T[4]=substr($x,$i+3);
#----------------------------------------------------------------------------
# Dotted host -> four octets -> 32-bit big-endian integer.
$host=unpack("N",pack("C4",split(/\./,$T[0])));
$timestamp=str2time($T[3]);
$request=$T[4];
}
# split5 - locate the quoted request first, then the bracketed timestamp, and
# split whatever is left of the line on single spaces.
#
# Reads:  $s
# Writes: @T ([0] request, [1] timestamp, then the remaining space-separated
#         fields starting with the host at [2]), $request, $timestamp, $host.
sub split5
{ ($host,$timestamp,$request)=('','','');
  #--------------------------------------------------------------------------
  my $req_from=index($s,'"')+1;            # first character after the opening quote
  my $req_to  =rindex($s,'"');             # position of the closing quote
  # Everything left of the "] \"" sequence that precedes the request.
  my $head    =substr($s,0,$req_from-3);
  my $bracket =rindex($head,'[');
  @T=( substr($s,$req_from,$req_to-$req_from),
       substr($head,$bracket+1),
       split(/ /,substr($head,0,$bracket-1).substr($s,$req_to+1)) );
  #--------------------------------------------------------------------------
  $request=$T[0];
  $timestamp=str2time($T[1]);
  my @octets=split(/\./,$T[2]);
  $host=unpack("N",pack("C4",@octets));
}
# split6 - like split5, but takes the timestamp as a fixed 26-character field
# after '[' and the request as everything from 30 characters after '[' up to
# the last double quote.
#
# Reads:  $s
# Writes: @T ([0] timestamp, [1] request, then the remaining space-separated
#         fields starting with the host at [2]), $timestamp, $request, $host.
sub split6
{ ($host,$timestamp,$request)=('','','');
  #--------------------------------------------------------------------------
  my $open  =index($s,'[');
  my $close =rindex($s,'"');
  my $req_at=$open+30;
  @T=( substr($s,$open+1,26),
       substr($s,$req_at,$close-$req_at),
       split(/ /,substr($s,0,$open-1).substr($s,$close+1)) );
  #--------------------------------------------------------------------------
  $timestamp=str2time($T[0]);
  $request=$T[1];
  my @octets=split(/\./,$T[2]);
  $host=unpack("N",pack("C4",@octets));
}
8X ----------------
答案 0 :(得分:1)
基于 amon 发现的结果(str2time 是瓶颈),我(随意地)选取了第一种拆分方法,分别用 str2time 和 Time::Piece 做了测试,后者确实更快。我还没有做性能分析,因此不确定耗时是否仍然集中在日期解析上(或者现在集中在这个 OO 模块中)。
#!/usr/bin/perl
# Benchmark script: the same regex split is timed twice, once converting the
# timestamp with str2time (parse) and once with Time::Piece (piece).
use strict;
use warnings;
use FileHandle;
use Date::Parse;
use Time::Piece;
use Benchmark;
STDOUT->autoflush(1); #....................................... autoflush STDOUT
# Sample input line read by both subs.
our $s='1.2.3.4 - - [13/Jun/2007:03:20:15 +0200] "GET / ..?,-" HTTP/1.0" 200 202';
# Package globals written by both subs: raw fields and the derived values.
our (@T,$host,$timestamp,$request);
# Functional check: run each variant once and print what it produced.
print "---- test functionality -----------------------------------\n";
parse(); print join(" ",@T)."\n1: [$host] [$timestamp] [$request]\n";
piece(); print join(" ",@T)."\n2: [$host] [$timestamp] [$request]\n";
# Read (and discard) one line of input before the timing run starts.
print "---- hit <ENTER> to start Test ----"; <>;
# Time 100000 iterations of each variant, passed as code references.
timethese (
100000,
{
'1' => \&parse,
'2' => \&piece,
}
);
exit(0);
1; # never reached: exit(0) above ends the program first
# parse - regex split of the sample line in $s; the timestamp is converted
# with str2time.
#
# Reads:  $s
# Writes: @T (the twelve regex captures), $host, $timestamp, $request.
sub parse
{ ($host,$timestamp,$request)=('','','');
  #--------------------------------------------------------------------------
  # A failed match yields an empty list, so @T is emptied in that case.
  @T = $s =~ m/^(\d+)\.(\d+)\.(\d+)\.(\d+) (\S+) (\S+) \[(.+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$/;
  #--------------------------------------------------------------------------
  # pack "C4" consumes only the first four list elements (the octets).
  my $packed=pack("C4",@T);
  $host=unpack("N",$packed);
  $timestamp=str2time($T[6]);
  $request=join(" ",@T[7..9]);
}
# piece - regex split of the sample line in $s; the timestamp is converted
# with Time::Piece->strptime instead of str2time.
#
# Reads:  $s
# Writes: @T (the twelve regex captures), $host, $timestamp, $request.
sub piece
{ ($host,$timestamp,$request)=('','','');
  #--------------------------------------------------------------------------
  # A failed match yields an empty list, so @T is emptied in that case.
  @T = $s =~ m/^(\d+)\.(\d+)\.(\d+)\.(\d+) (\S+) (\S+) \[(.+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$/;
  #--------------------------------------------------------------------------
  # pack "C4" consumes only the first four list elements (the octets).
  $host=unpack("N",pack("C4",@T));
  my $when=Time::Piece->strptime($T[6], '%d/%b/%Y:%H:%M:%S %z');
  $timestamp=$when->epoch;
  $request=join(" ",@T[7..9]);
}
在我那台性能较弱的上网本上,我得到了如下结果:
---- test functionality -----------------------------------
1 2 3 4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
1: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
1 2 3 4 - - 13/Jun/2007:03:20:15 +0200 GET / ..?,-" HTTP/1.0 200 202
2: [16909060] [1181697615] [GET / ..?,-" HTTP/1.0]
---- hit <ENTER> to start Test ----
Benchmark: timing 100000 iterations of 1, 2...
1: 29 wallclock secs (27.58 usr + 1.03 sys = 28.61 CPU) @ 3495.28/s (n=100000)
2: 11 wallclock secs (11.25 usr + 0.00 sys = 11.25 CPU) @ 8888.89/s (n=100000)
答案 1 :(得分:1)
最后我发现了这种方法
它比第一次尝试快了近 4.5 倍,每分钟可以拆分约 1,000,000 行 CLF 记录。如果对 timegm 函数加以修改,它可能还会更快。
#!/usr/bin/perl -w
# Benchmark script: compares split1ST (regex + str2time) with splitCLF
# (precompiled regex + Time::Local::timegm_nocheck).
use strict;
use warnings;
use Date::Parse;
use Time::Piece;
use Time::Local 'timegm_nocheck';
use Benchmark;
# Month abbreviation -> zero-based month index, as used by splitCLF.
our %midx = ('Jan'=>0,'Feb'=>1,'Mar'=>2,'Apr'=>3,'May'=>4,'Jun'=>5,
'Jul'=>6,'Aug'=>7,'Sep'=>8,'Oct'=>9,'Nov'=>10,'Dec'=>11);
# Precompiled pattern for one whole line, used by splitCLF. Captures:
#   1-4 IP octets, 5-6 the two fields after the host,
#   7 day, 8 month name, 9 year, 10 hour, 11 minute, 12 second, 13 zone,
#   14-16 the three parts of the quoted request, 17-18 the last two fields.
our $re = qr/\A
(\d+)\.(\d+)\.(\d+)\.(\d+)
[ ] (\S+)
[ ] (\S+)
[ ] \[(\d+)\/(\S+)\/(\d+):(\d+):(\d+):(\d+) [ ] (\S+)\]
[ ] "(\S+) [ ] (.*?) [ ] (\S+)"
[ ] (\S+)
[ ] (\S+)
\z/x;
# Sample input line. This has to be a package variable ("our", not "my"):
# the timing run passes code as strings that mention $s, and Benchmark
# compiles those strings outside this file's lexical scope, where a "my"
# variable would not be visible and $s would be undefined.
our $s='1.2.3.4 - - [13/Jun/2007:03:20:15 +0200] "GET / ..?,-" HTTP/1.0" 200 202';
# Functional check: print the field list returned by each implementation.
print "[".join('],[',split1ST($s))."]\n";
print "[".join('],[',splitCLF($s))."]\n";
[16909060],[1181697615],[/ ..?,-"],[GET],[HTTP/1.0],[200],[202],[-],[-]
[16909060],[1181697615],[/ ..?,-"],[GET],[HTTP/1.0],[200],[202],[-],[-]
# Read (and discard) one line of input before the timing run starts.
print "---- hit <ENTER> to start Test ----"; <>;
# Time 1000000 iterations of each implementation; the code is passed to
# Benchmark as strings keyed by the label shown in the report.
# NOTE(review): the strings refer to $s by name and are compiled by Benchmark,
# not in this file's lexical scope - confirm $s is visible there (i.e. that it
# is a package variable rather than a "my" variable).
timethese (
1000000,
{ 'split1ST' => '&split1ST($s)',
'splitCLF' => '&splitCLF($s)',
}
);
Benchmark: timing 1000000 iterations of split1ST, splitCLF...
split1ST: 338 wallclock secs (329.54 usr + 0.30 sys = 329.83 CPU) @ 3031.85/s (n=1000000)
splitCLF: 76 wallclock secs (73.79 usr + 0.16 sys = 73.94 CPU) @ 13523.75/s (n=1000000)
=> splitCLF 比第一次尝试快 4.46 倍
# End the program here; the sub definitions follow below.
exit(0);
1; # never reached: exit(0) above ends the program first
# split1ST - baseline implementation: split one CLF line with a single regex
# and convert the bracketed timestamp with str2time.
#
# Parameters:
#   $line - the log line to split.
# Returns:
#   (host as 32-bit integer, timestamp from str2time, request path, method,
#    protocol, second-to-last field, last field, and the two fields that
#    follow the host), or an empty list when $line is undefined or does not
#    match the pattern.
sub split1ST
{ my ($line) = @_;
  return unless defined($line);
  # Keep the captures in a lexical array: the script runs under "use strict",
  # and testing the assignment tells us whether the match succeeded at all.
  my @T = $line =~ m/^(\d+)\.(\d+)\.(\d+)\.(\d+) (\S+) (\S+) \[(.+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$/
    or return;
  return ( unpack("N",pack("C4",@T[0..3])), #........................ host-IPv4
           str2time($T[6]), #........................................ timestamp
           @T[8,7,9,10,11,4,5] ); # request,method,pro,sta,bytes,authusr,usr
}
# splitCLF - split one CLF line with the precompiled pattern $re and compute
# the epoch timestamp arithmetically via Time::Local::timegm_nocheck.
#
# Parameters:
#   $line - the log line to split.
# Reads:
#   $re   - the precompiled line pattern (18 captures).
#   %midx - month abbreviation -> zero-based month index.
# Returns:
#   (host as 32-bit integer, epoch timestamp, request path, method, protocol,
#    second-to-last field, last field, and the two fields that follow the
#    host), or an empty list when $line is undefined or does not match.
sub splitCLF
{ my ($line) = @_;
  # Without this check a non-matching line would return stale captures left
  # over from an earlier successful match.
  return unless defined($line) && $line =~ $re;
  # The zone is written as +hhmm / -hhmm. Convert hours and minutes
  # separately: multiplying the whole number by 36 is only correct when the
  # minutes are 00 (e.g. +0530 must be 19800 seconds, not 530*36 = 19080).
  my $zone    = $13;
  my $hhmm    = abs($zone);
  my $offset  = int($hhmm/100)*3600 + ($hhmm%100)*60;
  $offset     = -$offset if $zone < 0;
  return ( ((((($1<<8)|$2)<<8)|$3)<<8)|$4, #......................... host-IPv4
           Time::Local::timegm_nocheck($12,$11,$10,$7,$midx{$8},$9)-$offset, #ts
           $15,$14,$16,$17,$18,$5,$6); #request,method,pro,sta,bytes,authusr,usr
}
1;
答案 2 :(得分:0)
我花了一个小时摆弄正则表达式、各种可怕的 splice 和 substr 写法,甚至还写了一些 C 代码。然后,我做了一件至关重要的事情:
# set the benchmark iterations down to ~ 1E4
$ perl -d:NYTProf the-script.pl
$ nytprofhtml
# open ./nytprof/index.html in browser
我对代码做了性能分析(Devel::NYTProf)。不出所料:解析字符串只花了很少的时间。split1 中的正则表达式匹配总共耗时约 144 毫秒,而解析日期却在 str2time 中累积了 3.39 秒。这几乎是 1:25 的比例!
“过早优化是万恶之源。” —— D. Knuth
使用漂亮,可读的正则表达式,如
# Precompiled pattern for one whole line, written with /x so that each field
# sits on its own line ("[ ]" is a literal space under /x). Captures:
#   1-4  the four dotted-decimal octets of the host
#   5-6  the two fields that follow the host
#   7    everything between '[' and ']'
#   8    the quoted request as a single capture
#   9-10 the last two fields
my $split1_1_regex = qr/\A
(\d+)\.(\d+)\.(\d+)\.(\d+)
[ ] (\S+)
[ ] (\S+)
[ ] \[( [^\]]+ )\]
[ ] "(\S+ [ ] .*? [ ] \S+)"
[ ] (\S+)
[ ] (\S+)
\z/x;
这与你那些可怕的 (r)index/substr 写法一样快,但在某种程度上是自文档化的,当然也更容易调试。这符合一条经验:干净、惯用的 Perl 往往就是最快的 Perl。
然后,您可以选择接受 str2time 的缓慢,或者尝试优化它。如果您实现了可以证实的加速,可以考虑向上游(upstream)提交补丁。您也可以尝试使用其他库,或者编写自己的 str2time 函数,针对您的特殊用例进行优化。