I have multiple HTTP headers in one huge file, separated by an empty line.
Host
Connection
Accept
From
User-Agent
Accept-Encoding

Host
Connection
Accept
From
User-Agent
Accept-Encoding
X-Forwarded-For

cookie
Cache-Control
referer
x-fb-sim-hni
Host
Accept
user-agent
x-fb-net-sid
x-fb-net-hni
X-Purpose
accept-encoding
x-fb-http-engine
Connection

User-Agent
Host
Connection
Accept-Encoding
I have about 10,000,000 headers separated by empty lines. If I want to discover trends, such as header order, I'd like to collapse each block of headers onto a single line (how do I join lines up to a terminating empty line, and do this for every block separately?):
Host,Connection,Accept,From,User-Agent,Accept-Encoding
and then follow with: uniq -c | sort -nk1
so I can get:
197897 Host,Connection,Accept,From,User-Agent,Accept-Encoding
8732233 User-Agent,Host,Connection,Accept-Encoding
What is the best and most efficient way to parse such a massive file and extract this data?
Thanks for any tips.
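(A sketch of that pipeline, assuming GNU awk and coreutils; headers.txt is a stand-in name. Paragraph mode, RS="", reads each blank-line-separated block as one record, $1=$1 re-joins the fields with commas, and uniq -c needs sorted input, hence the first sort:)
$ awk 'BEGIN { RS=""; FS="\n"; OFS="," } { $1=$1; print }' headers.txt | sort | uniq -c | sort -nk1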
Answer 0 (score: 2)
With GNU awk for sorted_in, all you need is:
$ cat tst.awk
BEGIN { RS=""; FS="\n"; OFS="," }
{ $1=$1; cnt[$0]++ }
END {
    PROCINFO["sorted_in"] = "@val_num_desc"
    for (rec in cnt) {
        print cnt[rec] " " rec
    }
}
After running dos2unix on the sample file you posted (1.5milGETs.txt):
$ time awk -f tst.awk 1.5milGETs.txt > ou.awk
real 0m4.898s
user 0m4.758s
sys 0m0.108s
$ head -10 ou.awk
71639 Host,Accept,User-Agent,Pragma,Connection
70975 Host,ros-SecurityFlags,ros-SessionTicket,ros-Challenge,ros-HeadersHmac,Scs-Ticket,If-Modified-Since,User-Agent
40781 Host,Accept,User-Agent,Pragma,nnCoection,Connection,X-Forwarded-For
35485 Accept,ros-SecurityFlags,ros-SessionTicket,ros-Challenge,ros-HeadersHmac,Scs-Ticket,If-Modified-Since,User-Agent,Accept-Language,UA-CPU,Accept-Encoding,Host,Connection
34005 User-Agent,Host,Connection,Accept-Encoding
30668 Host,User-Agent,Accept-Encoding,Connection
25547 Host,Accept,Accept-Language,Connection,Accept-Encoding,User-Agent
22581 Host,User-Agent,Accept,Accept-Encoding
19311 Host,Connection,Accept,From,User-Agent,Accept-Encoding
14694 Host,Connection,User-Agent,Accept,Referer,Accept-Encoding,Accept-Language,Cookie
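A note on the dos2unix step above: with CRLF line endings, a "blank" separator line still contains a carriage return, so paragraph mode (RS="") never sees a truly empty line and the blocks won't split. If you'd rather not modify the file in place, stripping the CRs on the fly should work too (a sketch, using the same tst.awk):
$ tr -d '\r' < 1.5milGETs.txt | awk -f tst.awk > ou.awk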
Answer 1 (score: 1)
Here is an answer written in (POSIX) C which, AFAICT, does what the OP asked for. The C solution seems to be faster than the awk-based solutions. That may or may not be useful; it all depends on how often the program will run and on the input data.
The main idea: mmap() the file, rewrite the newlines in place so each block becomes one comma-separated, NUL-terminated record, sort the records, then count and sort the duplicates.
Anyway, here is the code. (Disclaimer: it was written so it could be posted here on SO.)
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
struct uniq {
    char *val;
    size_t count;
};

struct entry {
    char *val;
};
// Some globals
size_t g_filesize;
char* g_baseaddr;
struct entry *g_entries;
size_t g_entrysize, g_entrycapacity;
struct uniq *g_unique;
size_t g_uniquesize, g_uniquecapacity;
static inline void mapfile(const char *filename)
{
    int fd;
    struct stat st;

    if ((fd = open(filename, O_RDWR)) == -1 || fstat(fd, &st)) {
        perror(filename);
        exit(__LINE__);
    }

    g_baseaddr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    if (g_baseaddr == (void *)MAP_FAILED) {
        perror(filename);
        close(fd);
        exit(__LINE__);
    }

    close(fd);
    g_filesize = st.st_size;
    // Note: strchr() in convert_to_entries() relies on the mapping being
    // NUL-terminated. mmap() zero-fills the remainder of the last page,
    // which covers us unless the file size is an exact multiple of the
    // page size.
}
// Guestimate how many entries we have. We do this only to avoid early
// reallocs, so this isn't that important. Let's say 100 bytes per entry.
static inline void setup_entry_table(void)
{
    g_entrycapacity = g_filesize / 100;
    g_entrysize = 0;
    size_t cb = sizeof *g_entries * g_entrycapacity;
    if ((g_entries = malloc(cb)) == NULL)
        exit(__LINE__);
    memset(g_entries, 0, cb);
}
static inline void realloc_if_needed(void)
{
    if (g_entrysize == g_entrycapacity) {
        size_t newcap = g_entrycapacity * 2;
        size_t cb = newcap * sizeof *g_entries;
        struct entry *tmp = realloc(g_entries, cb);
        if (tmp == NULL)
            exit(__LINE__);
        g_entries = tmp;
        g_entrycapacity = newcap;
    }
}
static inline void add_entry(char *p)
{
    realloc_if_needed();
    g_entries[g_entrysize].val = p;
    g_entrysize++;
}
// Convert input data to proper entries by replacing \n with either
// ',' or \0. We add \0 to separate the entries.
static inline void convert_to_entries(void)
{
    char *endaddr = g_baseaddr + g_filesize;
    char *prev, *s = g_baseaddr;

    // First entry
    prev = s;

    while (s < endaddr) {
        char *nl = strchr(s, '\n');
        if (nl == NULL) {       // No trailing \n on the last line;
            s = endaddr;        // the tail is added after the loop.
            break;
        }
        if (nl == s) {
            if (nl - prev > 0)  // Skip empty strings
                add_entry(prev);
            *nl = '\0';         // Terminate entry
            s = nl + 1;         // Skip to first byte after \0
            prev = s;           // This is the start of the 'previous' record
        }
        else {
            *nl = ',';          // Replace \n with comma
            s = nl + 1;         // Move pointer forward (optimization).
            if (*s == '\n')
                *(s - 1) = '\0';    // Don't add trailing comma
        }
    }

    if (prev < s)
        add_entry(prev);        // Don't forget last entry
}
static int entrycmp(const void *v1, const void *v2)
{
    const struct entry *p1 = v1, *p2 = v2;
    return strcmp(p1->val, p2->val);
}

// Sort the entries so the pointers point to a sorted list of strings.
static inline void sort_entries(void)
{
    qsort(g_entries, g_entrysize, sizeof *g_entries, entrycmp);
}
// We keep things really simple and allocate one unique entry for each
// entry. That's the worst case anyway and then we don't have to test
// for reallocation.
static inline void setup_unique_table(void)
{
    size_t cb = sizeof *g_unique * g_entrysize;
    if ((g_unique = malloc(cb)) == NULL)
        exit(__LINE__);
    g_uniquesize = 0;
    g_uniquecapacity = g_entrysize;
}

static inline void add_unique(char *s)
{
    g_unique[g_uniquesize].val = s;
    g_unique[g_uniquesize].count = 1;
    g_uniquesize++;
}
// Now count and skip duplicate entries.
// How? Just iterate over the entries table and find duplicates.
// For each duplicate, increment count. For each non-dup,
// add a new entry.
static inline void find_unique_entries(void)
{
    char *last = g_entries[0].val;

    add_unique(last);
    for (size_t i = 1; i < g_entrysize; i++) {
        if (strcmp(g_entries[i].val, last) == 0) {
            g_unique[g_uniquesize - 1].count++; // Increment last added entry's count
        }
        else {
            last = g_entries[i].val;
            add_unique(last);
        }
    }
}
static inline void print_unique_entries(void)
{
    for (size_t i = 0; i < g_uniquesize; i++)
        printf("%zu %s\n", g_unique[i].count, g_unique[i].val);
}

static inline void print_entries(void)
{
    for (size_t i = 0; i < g_entrysize; i++)
        printf("%s\n", g_entries[i].val);
}

static int uniquecmp(const void *v1, const void *v2)
{
    const struct uniq *p1 = v1, *p2 = v2;
    // Descending by count. Compare instead of subtracting, since casting
    // huge size_t counts to int could overflow.
    return (p2->count > p1->count) - (p2->count < p1->count);
}

static inline void sort_unique_entries(void)
{
    qsort(g_unique, g_uniquesize, sizeof *g_unique, uniquecmp);
}
int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "USAGE: %s filename\n", argv[0]);
        exit(__LINE__);
    }

    mapfile(argv[1]);
    setup_entry_table();
    convert_to_entries();
    if (g_entrysize == 0) // no entries in file.
        exit(0);

    sort_entries();
    setup_unique_table();
    find_unique_entries();
    sort_unique_entries();

    if (0) print_entries();
    if (1) print_unique_entries();

    // cleanup
    free(g_entries);
    free(g_unique);
    munmap(g_baseaddr, g_filesize);
    exit(0);
}
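To build and run it (a sketch; hdrcount.c is a hypothetical name for the source file above):
$ cc -O2 -o hdrcount hdrcount.c
$ ./hdrcount 1.5milGETs.txt | head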
Answer 2 (score: 0)
Personally, I would use a C program; other alternatives exist too. Here's an awk snippet that folds the lines. Not perfect, but it should get you started :)
$ cat foo.awk
// {
    if (NF == 0)
        printf("\n");
    else
        printf("%s ", $0);
}
$ awk -f foo.awk < lots_of_data | sort | uniq -c | sort -nk1
That last command will take "forever", which is why the C program might be a good alternative. It mostly depends on how often you run the commands.
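One mitigation before reaching for C: sort is dramatically faster with byte-wise collation than with locale-aware collation, so forcing the C locale is worth trying (a sketch):
$ awk -f foo.awk < lots_of_data | LC_ALL=C sort | uniq -c | sort -nk1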
Answer 3 (score: 0)
If you have enough memory (10M records at roughly 80 chars per record in your sample is about 800 MB, and since you want to count them I assume there are lots of duplicates), you could hash the records into memory and count while hashing:
$ awk 'BEGIN { RS=""; OFS="," }
{
    b = ""                              # reset buffer b
    for (i = 1; i <= NF; i++)           # for every header element in record
        b = b (b == "" ? "" : OFS) $i   # buffer them and comma separate
    a[b]++                              # hash to a, counting
}
END {                                   # in the end
    for (i in a)                        # go thru the a hash
        print a[i] " " i                # print counts and records
}' file
1 Host,Connection,Accept,From,User-Agent,Accept-Encoding
1 cookie,Cache-Control,referer,x-fb-sim-hni,Host,Accept,user-agent,x-fb-net-sid,x-fb-net-hni,X-Purpose,accept-encoding,x-fb-http-engine,Connection
1 User-Agent,Host,Connection,Accept-Encoding
1 Host,Connection,Accept,From,User-Agent,Accept-Encoding,X-Forwarded-For
The output order is random due to the nature of i in a, so sort the output afterwards if needed.
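For example, to see the most common orderings first (a sketch, piping the same program into a numeric sort):
$ awk 'BEGIN { RS=""; OFS="," }
       { b=""; for (i=1; i<=NF; i++) b = b (b=="" ? "" : OFS) $i; a[b]++ }
       END { for (i in a) print a[i] " " i }' file | sort -rn | head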
Edit:
As @dawg pointed out in the comments, $1=$1 is enough to rebuild the record into comma-separated form:
$ awk 'BEGIN { RS=""; OFS="," }
{
    $1 = $1                     # rebuild the record
    a[$0]++                     # hash $0 to a, counting
}
END {                           # in the end
    for (i in a)                # go thru the a hash
        print a[i] " " i        # print counts and records
}' file
Answer 4 (score: 0)
Using your 1.5milGETs.txt file (and converting the triple \n\n\n to \n\n so the blocks are separated uniformly), you can use ruby in paragraph mode:
$ ruby -F'\n' -lane 'BEGIN{ h=Hash.new(0); $/=""
    def commafy(n)
        n.to_s.reverse.gsub(/...(?=.)/,"\\&,").reverse
    end
}
h[$F.join(",")]+=1
# p $_
END{ printf "Total blocks: %s\n", commafy(h.values.sum)
    h2=h.sort_by {|k,v| -v}
    h2[0..10].map {|k,v| printf "%10s %s\n", commafy(v), k}
}' 1.5milGETs.txt
That prints the total number of blocks, sorts them from largest to smallest, and prints the top 10.
It prints:
Total blocks: 1,262,522
71,639 Host,Accept,User-Agent,Pragma,Connection
70,975 Host,ros-SecurityFlags,ros-SessionTicket,ros-Challenge,ros-HeadersHmac,Scs-Ticket,If-Modified-Since,User-Agent
40,781 Host,Accept,User-Agent,Pragma,nnCoection,Connection,X-Forwarded-For
35,485 Accept,ros-SecurityFlags,ros-SessionTicket,ros-Challenge,ros-HeadersHmac,Scs-Ticket,If-Modified-Since,User-Agent,Accept-Language,UA-CPU,Accept-Encoding,Host,Connection
34,005 User-Agent,Host,Connection,Accept-Encoding
30,668 Host,User-Agent,Accept-Encoding,Connection
25,547 Host,Accept,Accept-Language,Connection,Accept-Encoding,User-Agent
22,581 Host,User-Agent,Accept,Accept-Encoding
19,311 Host,Connection,Accept,From,User-Agent,Accept-Encoding
14,694 Host,Connection,User-Agent,Accept,Referer,Accept-Encoding,Accept-Language,Cookie
12,290 Host,User-Agent,Accept-Encoding
That takes about 8 seconds on a 6-year-old Mac.
Awk will be about 3x faster and is entirely appropriate for this job.
Ruby gives you more output options and makes it easier to analyze the data: you can create interactive HTML documents; output JSON, quoted csv, or xml trivially; interact with a database; invert keys and values in a statement; filter; and so on.