I have multiple HTTP headers in one huge file, separated by an empty line.
Host
Connection
Accept
From
User-Agent
Accept-Encoding

Host
Connection
Accept
From
User-Agent
Accept-Encoding
X-Forwarded-For

cookie
Cache-Control
referer
x-fb-sim-hni
Host
Accept
user-agent
x-fb-net-sid
x-fb-net-hni
X-Purpose
accept-encoding
x-fb-http-engine
Connection

User-Agent
Host
Connection
Accept-Encoding
I have about 10,000,000 headers separated by empty lines. If I want to discover trends, such as header order, I'd like to collapse each block of headers onto a single line (how do I join lines up to a terminating empty line, and do this for every block separately?):
Host,Connection,Accept,From,User-Agent,Accept-Encoding
and then follow with: uniq -c | sort -nk1
so I can get:
197897 Host,Connection,Accept,From,User-Agent,Accept-Encoding
8732233 User-Agent,Host,Connection,Accept-Encoding
What is the best and most efficient way to parse such a massive file and extract this data?
Thanks for any tips.
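(A sketch of that pipeline, assuming GNU awk and coreutils; headers.txt is a stand-in name. Paragraph mode, RS="", reads each blank-line-separated block as one record, $1=$1 re-joins the fields with commas, and uniq -c needs sorted input, hence the first sort:)
$ awk 'BEGIN { RS=""; FS="\n"; OFS="," } { $1=$1; print }' headers.txt | sort | uniq -c | sort -nk1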
Answer 0 (score: 2)
With GNU awk for sorted_in, all you need is:
$ cat tst.awk
BEGIN { RS=""; FS="\n"; OFS="," }
{ $1=$1; cnt[$0]++ }
END {
    PROCINFO["sorted_in"] = "@val_num_desc"
    for (rec in cnt) {
        print cnt[rec] " " rec
    }
}
After running dos2unix on the sample file you posted (1.5milGETs.txt):
$ time awk -f tst.awk 1.5milGETs.txt > ou.awk
real 0m4.898s
user 0m4.758s
sys 0m0.108s
$ head -10 ou.awk
71639 Host,Accept,User-Agent,Pragma,Connection
70975 Host,ros-SecurityFlags,ros-SessionTicket,ros-Challenge,ros-HeadersHmac,Scs-Ticket,If-Modified-Since,User-Agent
40781 Host,Accept,User-Agent,Pragma,nnCoection,Connection,X-Forwarded-For
35485 Accept,ros-SecurityFlags,ros-SessionTicket,ros-Challenge,ros-HeadersHmac,Scs-Ticket,If-Modified-Since,User-Agent,Accept-Language,UA-CPU,Accept-Encoding,Host,Connection
34005 User-Agent,Host,Connection,Accept-Encoding
30668 Host,User-Agent,Accept-Encoding,Connection
25547 Host,Accept,Accept-Language,Connection,Accept-Encoding,User-Agent
22581 Host,User-Agent,Accept,Accept-Encoding
19311 Host,Connection,Accept,From,User-Agent,Accept-Encoding
14694 Host,Connection,User-Agent,Accept,Referer,Accept-Encoding,Accept-Language,Cookie
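A note on the dos2unix step above: with CRLF line endings, a "blank" separator line still contains a carriage return, so paragraph mode (RS="") never sees a truly empty line and the blocks won't split. If you'd rather not modify the file in place, stripping the CRs on the fly should work too (a sketch, using the same tst.awk):
$ tr -d '\r' < 1.5milGETs.txt | awk -f tst.awk > ou.awk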
Answer 1 (score: 1)
Here is an answer written in (POSIX) C which, AFAICT, does what the OP asked for. The C solution seems to be faster than the awk-based solutions. That may or may not be useful; it all depends on how often the program will run and on the input data.
The main idea: mmap() the file, rewrite the newlines in place so each block becomes one comma-separated, NUL-terminated record, sort the records, then count and sort the duplicates.
Anyway, here is the code. (Disclaimer: it was written so it could be posted here on SO.)
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
struct uniq {
    char *val;
    size_t count;
};

struct entry {
    char *val;
};
// Some globals
size_t g_filesize;
char* g_baseaddr;
struct entry *g_entries;
size_t g_entrysize, g_entrycapacity;
struct uniq *g_unique;
size_t g_uniquesize, g_uniquecapacity;
static inline void mapfile(const char *filename)
{
    int fd;
    struct stat st;

    if ((fd = open(filename, O_RDWR)) == -1 || fstat(fd, &st)) {
        perror(filename);
        exit(__LINE__);
    }

    g_baseaddr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    if (g_baseaddr == (void *)MAP_FAILED) {
        perror(filename);
        close(fd);
        exit(__LINE__);
    }

    close(fd);
    g_filesize = st.st_size;
    // Note: strchr() in convert_to_entries() relies on the mapping being
    // NUL-terminated. mmap() zero-fills the remainder of the last page,
    // which covers us unless the file size is an exact multiple of the
    // page size.
}
// Guestimate how many entries we have. We do this only to avoid early
// reallocs, so this isn't that important. Let's say 100 bytes per entry.
static inline void setup_entry_table(void)
{
    g_entrycapacity = g_filesize / 100;
    g_entrysize = 0;
    size_t cb = sizeof *g_entries * g_entrycapacity;
    if ((g_entries = malloc(cb)) == NULL)
        exit(__LINE__);
    memset(g_entries, 0, cb);
}
static inline void realloc_if_needed(void)
{
    if (g_entrysize == g_entrycapacity) {
        size_t newcap = g_entrycapacity * 2;
        size_t cb = newcap * sizeof *g_entries;
        struct entry *tmp = realloc(g_entries, cb);
        if (tmp == NULL)
            exit(__LINE__);
        g_entries = tmp;
        g_entrycapacity = newcap;
    }
}
static inline void add_entry(char *p)
{
    realloc_if_needed();
    g_entries[g_entrysize].val = p;
    g_entrysize++;
}
// Convert input data to proper entries by replacing \n with either
// ',' or \0. We add \0 to separate the entries.
static inline void convert_to_entries(void)
{
    char *endaddr = g_baseaddr + g_filesize;
    char *prev, *s = g_baseaddr;

    // First entry
    prev = s;

    while (s < endaddr) {
        char *nl = strchr(s, '\n');
        if (nl == NULL) {       // No trailing \n on the last line;
            s = endaddr;        // the tail is added after the loop.
            break;
        }
        if (nl == s) {
            if (nl - prev > 0)  // Skip empty strings
                add_entry(prev);
            *nl = '\0';         // Terminate entry
            s = nl + 1;         // Skip to first byte after \0
            prev = s;           // This is the start of the 'previous' record
        }
        else {
            *nl = ',';          // Replace \n with comma
            s = nl + 1;         // Move pointer forward (optimization).
            if (*s == '\n')
                *(s - 1) = '\0';    // Don't add trailing comma
        }
    }

    if (prev < s)
        add_entry(prev);        // Don't forget last entry
}
static int entrycmp(const void *v1, const void *v2)
{
    const struct entry *p1 = v1, *p2 = v2;
    return strcmp(p1->val, p2->val);
}

// Sort the entries so the pointers point to a sorted list of strings.
static inline void sort_entries(void)
{
    qsort(g_entries, g_entrysize, sizeof *g_entries, entrycmp);
}
// We keep things really simple and allocate one unique entry for each
// entry. That's the worst case anyway and then we don't have to test
// for reallocation.
static inline void setup_unique_table(void)
{
    size_t cb = sizeof *g_unique * g_entrysize;
    if ((g_unique = malloc(cb)) == NULL)
        exit(__LINE__);
    g_uniquesize = 0;
    g_uniquecapacity = g_entrysize;
}

static inline void add_unique(char *s)
{
    g_unique[g_uniquesize].val = s;
    g_unique[g_uniquesize].count = 1;
    g_uniquesize++;
}
// Now count and skip duplicate entries.
// How? Just iterate over the entries table and find duplicates.
// For each duplicate, increment count. For each non-dup,
// add a new entry.
static inline void find_unique_entries(void)
{
    char *last = g_entries[0].val;

    add_unique(last);
    for (size_t i = 1; i < g_entrysize; i++) {
        if (strcmp(g_entries[i].val, last) == 0) {
            g_unique[g_uniquesize - 1].count++; // Increment last added entry's count
        }
        else {
            last = g_entries[i].val;
            add_unique(last);
        }
    }
}
static inline void print_unique_entries(void)
{
    for (size_t i = 0; i < g_uniquesize; i++)
        printf("%zu %s\n", g_unique[i].count, g_unique[i].val);
}

static inline void print_entries(void)
{
    for (size_t i = 0; i < g_entrysize; i++)
        printf("%s\n", g_entries[i].val);
}

static int uniquecmp(const void *v1, const void *v2)
{
    const struct uniq *p1 = v1, *p2 = v2;
    // Descending by count. Compare instead of subtracting, since casting
    // huge size_t counts to int could overflow.
    return (p2->count > p1->count) - (p2->count < p1->count);
}

static inline void sort_unique_entries(void)
{
    qsort(g_unique, g_uniquesize, sizeof *g_unique, uniquecmp);
}
int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "USAGE: %s filename\n", argv[0]);
        exit(__LINE__);
    }

    mapfile(argv[1]);
    setup_entry_table();
    convert_to_entries();
    if (g_entrysize == 0) // no entries in file.
        exit(0);

    sort_entries();
    setup_unique_table();
    find_unique_entries();
    sort_unique_entries();

    if (0) print_entries();
    if (1) print_unique_entries();

    // cleanup
    free(g_entries);
    free(g_unique);
    munmap(g_baseaddr, g_filesize);
    exit(0);
}
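To build and run it (a sketch; hdrcount.c is a hypothetical name for the source file above):
$ cc -O2 -o hdrcount hdrcount.c
$ ./hdrcount 1.5milGETs.txt | head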
Answer 2 (score: 0)
Personally, I would use a C program; other alternatives exist too. Here's an awk snippet that folds the lines. Not perfect, but it should get you started :)
$ cat foo.awk
// {
    if (NF == 0)
        printf("\n");
    else
        printf("%s ", $0);
}
$ awk -f foo.awk < lots_of_data | sort | uniq -c | sort -nk1
That last command will take "forever", which is why the C program might be a good alternative. It mostly depends on how often you run the commands.
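One mitigation before reaching for C: sort is dramatically faster with byte-wise collation than with locale-aware collation, so forcing the C locale is worth trying (a sketch):
$ awk -f foo.awk < lots_of_data | LC_ALL=C sort | uniq -c | sort -nk1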
Answer 3 (score: 0)
If you have enough memory (10M records at roughly 80 chars per record in your sample is about 800 MB, and since you want to count them I assume there are lots of duplicates), you could hash the records into memory and count while hashing:
$ awk 'BEGIN { RS=""; OFS="," }
{
    b = ""                              # reset buffer b
    for (i = 1; i <= NF; i++)           # for every header element in record
        b = b (b == "" ? "" : OFS) $i   # buffer them and comma separate
    a[b]++                              # hash to a, counting
}
END {                                   # in the end
    for (i in a)                        # go thru the a hash
        print a[i] " " i                # print counts and records
}' file
1 Host,Connection,Accept,From,User-Agent,Accept-Encoding
1 cookie,Cache-Control,referer,x-fb-sim-hni,Host,Accept,user-agent,x-fb-net-sid,x-fb-net-hni,X-Purpose,accept-encoding,x-fb-http-engine,Connection
1 User-Agent,Host,Connection,Accept-Encoding
1 Host,Connection,Accept,From,User-Agent,Accept-Encoding,X-Forwarded-For
The output order is random due to the nature of i in a, so sort the output afterwards if needed.
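For example, to see the most common orderings first (a sketch, piping the same program into a numeric sort):
$ awk 'BEGIN { RS=""; OFS="," }
       { b=""; for (i=1; i<=NF; i++) b = b (b=="" ? "" : OFS) $i; a[b]++ }
       END { for (i in a) print a[i] " " i }' file | sort -rn | head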
Edit:
As @dawg pointed out in the comments, $1=$1 is enough to rebuild the record into comma-separated form:
$ awk 'BEGIN { RS=""; OFS="," }
{
    $1 = $1                     # rebuild the record
    a[$0]++                     # hash $0 to a, counting
}
END {                           # in the end
    for (i in a)                # go thru the a hash
        print a[i] " " i        # print counts and records
}' file
Answer 4 (score: 0)
Using your 1.5milGETs.txt file (and converting the triple \n\n\n to \n\n so the blocks are separated uniformly), you can use ruby in paragraph mode:
$ ruby -F'\n' -lane 'BEGIN{ h=Hash.new(0); $/=""
    def commafy(n)
        n.to_s.reverse.gsub(/...(?=.)/,"\\&,").reverse
    end
}
h[$F.join(",")]+=1
# p $_
END{ printf "Total blocks: %s\n", commafy(h.values.sum)
    h2=h.sort_by {|k,v| -v}
    h2[0..10].map {|k,v| printf "%10s %s\n", commafy(v), k}
}' 1.5milGETs.txt
That prints the total number of blocks, sorts them from largest to smallest, and prints the top 10.
It prints:
Total blocks: 1,262,522
71,639 Host,Accept,User-Agent,Pragma,Connection
70,975 Host,ros-SecurityFlags,ros-SessionTicket,ros-Challenge,ros-HeadersHmac,Scs-Ticket,If-Modified-Since,User-Agent
40,781 Host,Accept,User-Agent,Pragma,nnCoection,Connection,X-Forwarded-For
35,485 Accept,ros-SecurityFlags,ros-SessionTicket,ros-Challenge,ros-HeadersHmac,Scs-Ticket,If-Modified-Since,User-Agent,Accept-Language,UA-CPU,Accept-Encoding,Host,Connection
34,005 User-Agent,Host,Connection,Accept-Encoding
30,668 Host,User-Agent,Accept-Encoding,Connection
25,547 Host,Accept,Accept-Language,Connection,Accept-Encoding,User-Agent
22,581 Host,User-Agent,Accept,Accept-Encoding
19,311 Host,Connection,Accept,From,User-Agent,Accept-Encoding
14,694 Host,Connection,User-Agent,Accept,Referer,Accept-Encoding,Accept-Language,Cookie
12,290 Host,User-Agent,Accept-Encoding
That takes about 8 seconds on a 6-year-old Mac.
Awk will be about 3x faster and is entirely appropriate for this job.
Ruby gives you more output options and makes it easier to analyze the data: you can create interactive HTML documents; output JSON, quoted csv, or xml trivially; interact with a database; invert keys and values in a statement; filter; and so on.