提升file_mapping性能

时间:2015-10-24 13:58:20

标签: c++ boost

我写了一个小测试来比较 boost file_mapping 和 std::ofstream 之间的文件写入性能。我原本以为file_mapping的性能会更好,但显然并非如此。

有人能解释为什么我会用std::ofstream获得更好的数字吗?

[编辑]:所以我做了基准测试的分析,发现boost::iostreams::detail::direct_streambuf花了很多时间复制字节。我添加了一个使用std::copy_n代替ostream.write的新测试。现在表现好多了。我还更新了测试代码以与不同的文件大小进行比较。

direct_streambuf相比,提升iostream std::copy_n确实在高容量上挣扎。我希望找到一个更好的替代方案,因为我的应用程序基于ostream而且我负担不起重构。

#include <boost/interprocess/file_mapping.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include <boost/iostreams/device/array.hpp>
#include <boost/iostreams/stream.hpp>

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

int test_mapped_file_ostream(size_t TOTAL_SIZE, size_t BLOCK_SIZE, size_t N)
{
    // Benchmark: write N blocks of BLOCK_SIZE bytes through an std::ostream
    // backed by a boost::iostreams array_sink over a memory-mapped file.
    // Prints the elapsed time in milliseconds to stdout; returns 0.
    const std::string filename = "test_filemapping.dat";
    boost::interprocess::file_mapping::remove(filename.data());

    {
    // Pre-size the file: seek to the last byte and write a single byte so
    // the whole TOTAL_SIZE range is mappable.
    std::ofstream file(filename, std::ios::binary | std::ios::trunc);
    file.seekp(static_cast<std::streamoff>(TOTAL_SIZE-1));
    file.write("", 1);
    }

    // steady_clock is monotonic and the correct clock for measuring
    // intervals; system_clock can jump (e.g. NTP adjustment) mid-benchmark.
    std::chrono::steady_clock::time_point start;
    std::chrono::steady_clock::time_point end;
    {
        boost::interprocess::file_mapping fmap(filename.data(), boost::interprocess::read_write);
        boost::interprocess::mapped_region mreg(fmap, boost::interprocess::read_write);
        mreg.advise( boost::interprocess::mapped_region::advice_sequential );

        // The streambuf can live on the stack for the duration of this scope;
        // no heap allocation / shared_ptr is needed.
        boost::iostreams::stream_buffer<boost::iostreams::array_sink> buf(
            static_cast<char*>(mreg.get_address()), mreg.get_size());
        std::ostream ostream( &buf );

        const std::vector<char> data(BLOCK_SIZE,1);

        start = std::chrono::steady_clock::now();
        for ( size_t i=0; i<N; i++ ) {
            ostream.write( data.data(), data.size() );
        }
        end = std::chrono::steady_clock::now();
    }

    auto total = end-start;
    std::cout << "test_mapped_file_ostream (ms): " << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << std::endl;

    return 0;
}

int test_mapped_file_stdcopy_n(size_t TOTAL_SIZE, size_t BLOCK_SIZE, size_t N)
{
    // Benchmark: write N blocks of BLOCK_SIZE bytes directly into a
    // memory-mapped region with std::copy_n (no streambuf layer).
    // Prints the elapsed time in milliseconds to stdout; returns 0.
    const std::string filename = "test_filemapping_stdcopy.dat";
    boost::interprocess::file_mapping::remove(filename.data());

    {
    // Pre-size the file so the full TOTAL_SIZE range can be mapped.
    std::ofstream file(filename, std::ios::binary | std::ios::trunc);
    file.seekp(static_cast<std::streamoff>(TOTAL_SIZE-1));
    file.write("", 1);
    }

    // Use the monotonic steady_clock for interval measurement;
    // system_clock is subject to wall-clock adjustments.
    std::chrono::steady_clock::time_point start;
    std::chrono::steady_clock::time_point end;
    {
        boost::interprocess::file_mapping fmap(filename.data(), boost::interprocess::read_write);
        boost::interprocess::mapped_region mreg(fmap, boost::interprocess::read_write);
        mreg.advise( boost::interprocess::mapped_region::advice_sequential );

        char* regptr = static_cast<char*>(mreg.get_address());
        const std::vector<char> data(BLOCK_SIZE,1);

        start = std::chrono::steady_clock::now();
        for ( size_t i=0; i<N; i++ ) {
            std::copy_n( data.data(), data.size(), regptr );
            regptr += data.size();
        }
        end = std::chrono::steady_clock::now();
    }

    auto total = end-start;
    std::cout << "test_mapped_file_stdcopy_n (ms): " << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << std::endl;

    return 0;
}

int test_fstream_file(size_t TOTAL_SIZE, size_t BLOCK_SIZE, size_t N)
{
    // Baseline benchmark: write N blocks of BLOCK_SIZE bytes with
    // std::ofstream::write. Prints elapsed milliseconds to stdout.
    // Returns 0 on success, 1 if the output file cannot be opened.
    // TOTAL_SIZE is unused here; it is kept so all three tests share
    // a uniform signature.
    (void)TOTAL_SIZE;

    const std::string filename = "test_fstream.dat";

    // steady_clock is the correct monotonic clock for interval timing.
    std::chrono::steady_clock::time_point start;
    std::chrono::steady_clock::time_point end;
    {
        const std::vector<char> data(BLOCK_SIZE,1);
        std::ofstream file(filename, std::ios::binary | std::ios::trunc);
        if (!file) {
            // Fail loudly instead of silently timing writes to a bad stream.
            std::cout << "test_fstream_file: cannot open " << filename << std::endl;
            return 1;
        }
        start = std::chrono::steady_clock::now();
        for ( size_t i=0; i<N; i++ ) {
            file.write( data.data(), data.size() );
        }
        end = std::chrono::steady_clock::now();
    }
    auto total = end-start;
    std::cout << "test_fstream_file (ms): " << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << std::endl;

    return 0;
}

int main(int argc, char **argv)
{
    // Usage: <prog> <size of output file in gigabytes>
    // Runs all three write benchmarks with the same total size.
    if ( argc != 2 ) {
        std::cout << "Usage: " << argv[0] << " <size of output file in gigabytes>" << std::endl;
        exit(1);
    }

    // std::stoull throws std::invalid_argument / std::out_of_range on bad
    // input; report the error instead of crashing with an uncaught exception.
    uint64_t totalsize = 0;
    try {
        totalsize = std::stoull(argv[1]);
    } catch (const std::exception&) {
        std::cout << "Invalid size argument: " << argv[1] << std::endl;
        exit(1);
    }
    if (totalsize==0) {
        totalsize = 1;  // always write at least 1 GiB
    }

    const std::size_t GB = (uint64_t)1 << 30;
    const std::size_t TOTAL_SIZE = totalsize << 30;          // GiB -> bytes
    const std::size_t BLOCK_SIZE = (uint64_t)1 << 20;        // 1 MiB per write
    const std::size_t N = TOTAL_SIZE/BLOCK_SIZE;             // number of writes

    std::cout << "TOTAL_SIZE (GB)=" << TOTAL_SIZE/GB << std::endl;
    test_mapped_file_ostream(TOTAL_SIZE,BLOCK_SIZE,N);
    test_mapped_file_stdcopy_n(TOTAL_SIZE,BLOCK_SIZE,N);
    test_fstream_file(TOTAL_SIZE,BLOCK_SIZE,N);
    return 0;
}

结果:Windows 7,HHD,64GB RAM

性能结果(单位:毫秒),括号内为相对 fstream.write 的性能倍数:

TOTAL_SIZE (GB)=5
test_mapped_file_ostream (ms): 24610 (-1.88x)
test_mapped_file_stdcopy_n (ms): 3307 (3.9x)
test_fstream_file (ms): 13052

TOTAL_SIZE (GB)=10
test_mapped_file_ostream (ms): 49524 (-1.3x)
test_mapped_file_stdcopy_n (ms): 6610 (5.8x)
test_fstream_file (ms): 38219

TOTAL_SIZE (GB)=15
test_mapped_file_ostream (ms): 85041 (1.52x)
test_mapped_file_stdcopy_n (ms): 12387 (10.5x)
test_fstream_file (ms): 129964

TOTAL_SIZE (GB)=20
test_mapped_file_ostream (ms): 122897 (1.7x)
test_mapped_file_stdcopy_n (ms): 17542 (12.2x)
test_fstream_file (ms): 213697

性能剖析(Profiling)

enter image description here

2 个答案:

答案 0 :(得分:4)

您仍然在使用面向文本的 ostream,大部分时间将耗费在流的格式化处理上。

除此之外,考虑进行顺序访问。

最后找到你的瓶颈

我用我所知道的所有技巧来解决这个问题,并写出了下面这个非常简单的 POSIX mmap 与 write 的对比。

我在适用的情况下使用了 madvise/fadvise 的 SEQUENTIAL|WILLNEED 提示,并确保文件稀疏不会导致性能下降。

这一切的简短摘要是:

  • 你的代码真的可以简单得多(参见rev.176f546ea8f65050c)
  • 地图适用于较小的数量
  • 缓冲可能正是使基于流的实现胜过基于 mmap 的实现的原因

**Live On Coliru**

#include <boost/chrono.hpp>
#include <boost/chrono/chrono_io.hpp>
#include <iostream>
#include <vector>
#include <algorithm>

// mmap the manual way
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

// Benchmark dimensions. When COLIRU is defined (online compiler build)
// the sizes are shrunk drastically so the sandbox's time/disk limits
// are not exceeded.
#ifndef COLIRU
const std::size_t TOTAL_SIZE = 5ul << 30;   // 5 GiB total
const std::size_t BLOCK_SIZE = 1ul << 20;   // 1 MiB per write
#else
const std::size_t TOTAL_SIZE = 1ul << 20;   // 1 MiB total
const std::size_t BLOCK_SIZE = 1ul <<  9;   // 512 B per write
#endif
static_assert(0 == TOTAL_SIZE%BLOCK_SIZE, "not divisable by block size");
const int N = TOTAL_SIZE/BLOCK_SIZE;        // number of blocks written

// Runs f() exactly once, returning its result, and prints the elapsed wall
// time labelled with `task`.
// @param task  caption printed alongside the elapsed time
// @param f     callable invoked once; its return value is forwarded out
// Timing is scope-based: the local struct's destructor fires when `timed`
// returns — i.e. after f() completes (and after the return value is
// constructed) — and prints the duration via boost::chrono's stream I/O.
template <typename Caption, typename F>
auto timed(Caption const& task, F&& f) {
    using namespace boost::chrono;
    struct _ {
        high_resolution_clock::time_point s;  // start timestamp, captured before f() runs
        Caption const& task;
        ~_() { std::cout << " -- (" << task << " completed in " << duration_cast<milliseconds>(high_resolution_clock::now() - s) << ")\n"; }
    } timing { high_resolution_clock::now(), task };

    return f();
}

void test_mapped_file() {
    // mmap benchmark: preallocate a TOTAL_SIZE-byte file, map it read/write,
    // and copy BLOCK_SIZE chunks into it sequentially. Only the copy loop
    // is timed (via `timed`); allocation and mapping are outside the timer.
    std::vector<char> const data(BLOCK_SIZE, 1);
    const std::string filename = "test_filemapping.dat";

    std::remove(filename.c_str());

    int fd = open(filename.c_str(), O_RDWR|O_CREAT, 0644);

    if (fd==-1) {
        perror("open");
        exit(255);
    }

    // BUG FIX: posix_fallocate64 returns the error number directly and does
    // NOT set errno, so perror() here would print a stale/unrelated message.
    // Report the returned code instead.
    int rc = posix_fallocate64(fd, 0, TOTAL_SIZE);
    if (rc != 0) {
        fprintf(stderr, "fallocate64: error %d\n", rc);
        exit(255);
    }

    posix_fadvise64(fd, 0, TOTAL_SIZE, POSIX_FADV_WILLNEED | POSIX_FADV_SEQUENTIAL);

    void* addr = mmap64(nullptr, TOTAL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);

    // mmap signals failure with MAP_FAILED ((void*)-1), never nullptr —
    // checking for a null pointer was meaningless.
    if (addr == MAP_FAILED) {
        perror("mmap");
        exit(255);
    }
    char* fmap = static_cast<char*>(addr);

    madvise(fmap, TOTAL_SIZE, MADV_SEQUENTIAL | MADV_WILLNEED);

    timed(filename, [output=fmap, &data] () mutable {
        for (size_t i = 0; i < N; i++) {
            std::copy_n(data.data(), data.size(), output);
            output += data.size();
        }
    });

    munmap(fmap, TOTAL_SIZE);
    close(fd);
}

void test_posix_write() {
    // Baseline benchmark: write TOTAL_SIZE bytes with plain POSIX write(2),
    // BLOCK_SIZE at a time. Only the write loop is timed (via `timed`).
    std::vector<char> const data(BLOCK_SIZE, 1);
    const std::string filename = "test_posix.dat";

    std::remove(filename.c_str());

    int fd = open(filename.c_str(), O_RDWR|O_CREAT, 0644);

    if (fd==-1) {
        perror("open");
        exit(255);
    }

    posix_fadvise64(fd, 0, TOTAL_SIZE, POSIX_FADV_WILLNEED | POSIX_FADV_SEQUENTIAL);

    timed(filename, [&] () mutable {
        for (size_t i = 0; i < N; i++) {
            // write(2) may legally transfer fewer bytes than requested
            // (signal, quota, ...). The original assert()ed a full write;
            // instead, loop until the whole block has been accepted.
            size_t written = 0;
            while (written < data.size()) {
                ssize_t count = ::write(fd, data.data() + written, data.size() - written);
                if (count == -1) {
                    perror("write");
                    exit(255);
                }
                written += static_cast<size_t>(count);
            }
        }
    });

    close(fd);
}

// Entry point: run the mmap-based benchmark first, then the POSIX
// write(2) baseline; each prints its own timing line.
int main() {
    test_mapped_file();
    test_posix_write();
}

在 Coliru 上测试时的输出:

./a.out; md5sum *.dat
 -- (test_filemapping.dat completed in 0 milliseconds)
 -- (test_posix.dat completed in 8 milliseconds)
d35bb2e58b602d94ccd9628f249ae7e5  test_filemapping.dat
d35bb2e58b602d94ccd9628f249ae7e5  test_posix.dat

本地运行(5GiB卷):

$ ./test
 -- (test_filemapping.dat completed in 1950 milliseconds)
 -- (test_posix.dat completed in 1307 milliseconds)

答案 1 :(得分:2)

您可能遇到了颠簸(thrashing),这会显著拖慢通过内存映射写入文件所需的时间。您的基准测试会写出近 5 千兆字节的数据。如果您没有 5 千兆字节的可用 RAM,操作系统就会忙于把内存中的脏页换出到磁盘。

正如 sehe 所建议的,您可以考虑使用 madvise,因为在这种情况下,您是按顺序访问内存映射文件的:

mreg.advise( boost::interprocess::mapped_region::advice_sequential );

然而,请注意,这并不能解决颠簸(thrashing)的问题。