Question

我正在研究哪一个是更快的二进制文件阅读器：C ++的ifstream :: read或C的fread。

根据网上的资料（包括一些类似的问题），两者没有太大区别，所以我决定亲自深入研究一下。

我使用了一个1.22gb的pcap文件，其中包含大约1,377,000个数据包。这两个程序都是使用mingw32-g ++编译的，没有优化。

头结构是根据wireshark的wiki - libpcap文件结构定义的： https://wiki.wireshark.org/Development/LibpcapFileFormat

这是C代码：

#include <stdio.h>
#include <stdlib.h>
#include <Winsock2.h>

/* definition of structs: pcap_global_header, pcap_packet_header, ethernet_header, ipv4_header, tcp_header */

/*
 * Reads exactly `size` bytes from `file` into `dst`.
 * Returns 1 on success and 0 on a short read (end of file or I/O error).
 */
static int read_exact(void *dst, size_t size, FILE *file)
{
    return fread(dst, sizeof(char), size, file) == size;
}

/*
 * Counts the packets stored in "test.pcap" using C stdio (fread).
 *
 * For every packet the record header is read first, then the Ethernet header,
 * then (conditionally) the IPv4 and TCP headers, and finally the remaining
 * payload bytes are read into a scratch buffer of `snaplen` bytes.
 * The per-packet read pattern (up to four fread calls) is intentionally kept.
 *
 * Returns 0 when the whole file was processed, 1 on any error
 * (file cannot be opened, not an Ethernet capture, truncated/malformed data).
 */
int main()
{
    int count = 0;
    int status = 0;

    /* open file */
    FILE * file = fopen("test.pcap", "rb");

    if(file == NULL)
    {
        perror("test.pcap");
        return 1;
    }

    /* read file header */
    struct pcap_global_header gheader;

    if(!read_exact(&gheader, sizeof(struct pcap_global_header), file))
    {
        fprintf(stderr, "truncated pcap global header\n");
        fclose(file);
        return 1;
    }

    // if not ethernet type
    if(gheader.network != 1)
    {
        printf("not ethernet !\n");
        fclose(file);
        return 1;
    }

    /* read packets */
    char *buffer = (char*)malloc(gheader.snaplen);

    if(buffer == NULL)
    {
        fprintf(stderr, "cannot allocate packet buffer\n");
        fclose(file);
        return 1;
    }

    struct pcap_packet_header pheader;
    struct ether_header eth;
    struct ipv4_header ip;
    struct tcp_header tcp;

    /*
     * Loop on the result of the header read itself instead of feof():
     * feof() stays false after an I/O error, which would spin forever.
     */
    while(read_exact(&pheader, sizeof(struct pcap_packet_header), file))
    {
        size_t bytes_read = sizeof(struct ether_header);
        int ok = read_exact(&eth, sizeof(struct ether_header), file);

        // ip
        // NOTE(review): 0x08 presumably is EtherType 0x0800 stored in network
        // byte order and read on a little-endian host; confirm against the struct.
        if(ok && eth.type == 0x08)
        {
            ok = read_exact(&ip, sizeof(struct ipv4_header), file);
            bytes_read += sizeof(struct ipv4_header);

            //tcp
            if(ok && ip.protocol == 0x06)
            {
                ok = read_exact(&tcp, sizeof(struct tcp_header), file);
                bytes_read += sizeof(struct tcp_header);
            }
        }

        /*
         * The remaining length must neither wrap around (packet shorter than
         * the headers already consumed) nor exceed the scratch buffer.
         */
        if(ok)
        {
            ok = bytes_read <= pheader.incl_len
              && pheader.incl_len - bytes_read <= gheader.snaplen;
        }

        //read rest of the packet
        if(ok)
        {
            ok = read_exact(buffer, pheader.incl_len - bytes_read, file);
        }

        if(!ok)
        {
            fprintf(stderr, "truncated or malformed packet after %d complete packets\n", count);
            status = 1;
            break;
        }

        /* only completely read packets are counted */
        ++count;
    }

    if(ferror(file))
    {
        perror("test.pcap");
        status = 1;
    }

    printf("(C) total packets: %d\n", count);

    free(buffer);
    fclose(file);

    return status;
}

这是C ++代码：

#include <iostream>
#include <fstream>
#include <memory>

#include <Winsock2.h>

/* definition of structs: pcap_global_header, pcap_packet_header, ethernet_header, ipv4_header, tcp_header */

/*
 * Counts the packets stored in "test.pcap" using std::ifstream::read.
 *
 * For every packet the record header is read first, then the Ethernet header,
 * then (conditionally) the IPv4 and TCP headers, and finally the remaining
 * payload bytes are read into a scratch buffer of `snaplen` bytes.
 * The per-packet read pattern (up to four read calls) is intentionally kept.
 *
 * Returns 0 when the whole file was processed, 1 on any error
 * (file cannot be opened, not an Ethernet capture, truncated/malformed data).
 */
int main()
{
    int count_packets = 0;
    int status = 0;

    /* open file */
    std::ifstream file("test.pcap", std::fstream::binary | std::fstream::in);

    if(!file.is_open())
    {
        std::cerr << "cannot open test.pcap\n";
        return 1;
    }

    /* read file header */
    struct pcap_global_header gheader;

    if(!file.read(reinterpret_cast<char*>(&gheader), sizeof(struct pcap_global_header)))
    {
        std::cerr << "truncated pcap global header\n";
        return 1;
    }

    // if not ethernet type
    if(gheader.network != 1)
    {
        std::cout << "not ethernet !\n";
        return 1;
    }

    /* read packets */
    // Owning buffer: released automatically on every exit path.
    std::unique_ptr<char[]> buffer(new char[gheader.snaplen]);

    struct pcap_packet_header pheader;
    struct ether_header eth;
    struct ipv4_header ip;
    struct tcp_header tcp;

    /*
     * Loop on the result of the header read itself instead of eof():
     * eof() stays false when the stream fails for another reason,
     * which would spin forever.
     */
    while(file.read(reinterpret_cast<char*>(&pheader), sizeof(pcap_packet_header)))
    {
        std::size_t bytes_read = sizeof(struct ether_header);
        file.read(reinterpret_cast<char*>(&eth), sizeof(struct ether_header));

        // ip
        // NOTE(review): 0x08 presumably is EtherType 0x0800 stored in network
        // byte order and read on a little-endian host; confirm against the struct.
        if(file && eth.type == 0x08)
        {
            file.read(reinterpret_cast<char*>(&ip), sizeof(struct ipv4_header));
            bytes_read += sizeof(struct ipv4_header);

            //tcp
            if(file && ip.protocol == 0x06)
            {
                file.read(reinterpret_cast<char*>(&tcp), sizeof(struct tcp_header));
                bytes_read += sizeof(struct tcp_header);
            }
        }

        /*
         * The remaining length must neither wrap around (packet shorter than
         * the headers already consumed) nor exceed the scratch buffer.
         */
        const bool sane = file
                       && bytes_read <= pheader.incl_len
                       && pheader.incl_len - bytes_read <= gheader.snaplen;

        // read rest of the packet
        if(!sane || !file.read(buffer.get(), static_cast<std::streamsize>(pheader.incl_len - bytes_read)))
        {
            std::cerr << "truncated or malformed packet after " << count_packets << " complete packets\n";
            status = 1;
            break;
        }

        // only completely read packets are counted
        ++count_packets;
    }

    if(file.bad())
    {
        std::cerr << "I/O error while reading test.pcap\n";
        status = 1;
    }

    std::cout << "(C++) total packets :" << count_packets << '\n';

    return status;
}

结果非常令人失望：

C代码结果：

(C) total packets: 1377065

Process returned 0 (0x0)   execution time : 1.031 s
Press any key to continue.

C ++代码结果：

(C++) total packets :1377065

Process returned 0 (0x0)   execution time : 3.172 s
Press any key to continue.

显然，我运行了几次每个版本，所以，我正在寻找一种更快的方法来使用C ++读取文件。

Answer 1

ifstream::read 会将数据从内部缓冲区复制到您的缓冲区，这是造成性能差异的主要原因。您可以尝试克服这一点，通过 pubsetbuf 用您自己的缓冲区替换内部缓冲区：

std::ifstream file; char buf[1024]; file.rdbuf()->pubsetbuf(buf, sizeof buf);

问题是此函数是实现定义的，在大多数情况下，您仍然需要使用额外的数据副本。

在您的情况下，您不需要ifstream的全部功能，因此为了提高性能和简洁性，建议您使用<cstdio>。

Answer 2

fread()应该总是更快，因为它直接将字节读入缓冲区而无需额外处理（这里不需要）。

此外，最好一次读取整个数据包，而不是每个数据包调用fread() 4次。然后，您可以在缓冲区中使用ether_header*。

使用mmap()代替fread()可以为您提供额外的加速（无需将数据从内核模式复制到用户模式缓冲区）。对于Windows，请参阅CreateFileMapping()和MapViewOfFile() - 这允许您直接使用指针访问文件内容，就像它是一个大内存缓冲区一样。

性能比较 - pcap文件读取：C ++的ifstream VS C的fread

2 个答案: