我写了一个小测试来比较boost file_mapping
和std::ofstream
之间的文件写入操作。我的印象是file_mapping性能会更好,但显然并非如此。
有人能解释为什么我会用std::ofstream
获得更好的数字吗?
[编辑]:所以我做了基准测试的分析,发现boost::iostreams::detail::direct_streambuf
花了很多时间复制字节。我添加了一个使用std::copy_n
代替ostream.write
的新测试。现在表现好多了。我还更新了测试代码以与不同的文件大小进行比较。
与 std::copy_n 相比,boost iostream 的 direct_streambuf 在大数据量下确实表现不佳。我希望能找到更好的替代方案,因为我的应用程序基于 ostream,我无法承担重构的代价。
#include <boost/interprocess/file_mapping.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include <boost/iostreams/device/array.hpp>
#include <boost/iostreams/stream.hpp>

#include <algorithm>  // std::copy_n
#include <chrono>
#include <cstdint>    // uint64_t
#include <fstream>
#include <iostream>
#include <limits>     // std::numeric_limits (shift-overflow guard)
#include <memory>     // std::unique_ptr / std::shared_ptr
#include <string>
#include <vector>
int test_mapped_file_ostream(size_t TOTAL_SIZE, size_t BLOCK_SIZE, size_t N)
{
    // Benchmark: write N blocks of BLOCK_SIZE bytes through a std::ostream
    // backed by a boost::iostreams array_sink over a memory-mapped file.
    // Prints elapsed milliseconds; always returns 0.
    const std::string filename = "test_filemapping.dat";
    boost::interprocess::file_mapping::remove(filename.data());

    {
        // Pre-size the file: seek to the last byte and write one byte so the
        // full TOTAL_SIZE region exists and can be mapped.
        std::ofstream file(filename, std::ios::binary | std::ios::trunc);
        file.seekp(static_cast<std::streamoff>(TOTAL_SIZE - 1));
        file.write("", 1);
    }

    // steady_clock is monotonic — the correct clock for interval timing;
    // system_clock can jump (NTP adjustment, manual clock change).
    std::chrono::steady_clock::time_point start;
    std::chrono::steady_clock::time_point end;
    {
        boost::interprocess::file_mapping fmap(filename.data(), boost::interprocess::read_write);
        boost::interprocess::mapped_region mreg(fmap, boost::interprocess::read_write);
        mreg.advise(boost::interprocess::mapped_region::advice_sequential);

        // unique_ptr instead of shared_ptr(new ...): no shared ownership is
        // needed here, and a named cast replaces the C-style (char*) cast.
        std::unique_ptr<std::streambuf> buf(
            new boost::iostreams::stream_buffer<boost::iostreams::array_sink>(
                static_cast<char*>(mreg.get_address()), mreg.get_size()));
        std::ostream ostream(buf.get());

        const std::vector<char> data(BLOCK_SIZE, 1);
        start = std::chrono::steady_clock::now();
        for (size_t i = 0; i < N; i++) {
            ostream.write(data.data(), static_cast<std::streamsize>(data.size()));
        }
        end = std::chrono::steady_clock::now();
    }
    auto total = end - start;
    std::cout << "test_mapped_file_ostream (ms): "
              << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << std::endl;
    return 0;
}
int test_mapped_file_stdcopy_n(size_t TOTAL_SIZE, size_t BLOCK_SIZE, size_t N)
{
    // Benchmark: write N blocks of BLOCK_SIZE bytes directly into the mapped
    // region with std::copy_n, bypassing all stream buffering/formatting.
    // Prints elapsed milliseconds; always returns 0.
    const std::string filename = "test_filemapping_stdcopy.dat";
    boost::interprocess::file_mapping::remove(filename.data());

    {
        // Pre-size the file so the whole TOTAL_SIZE region can be mapped.
        std::ofstream file(filename, std::ios::binary | std::ios::trunc);
        file.seekp(static_cast<std::streamoff>(TOTAL_SIZE - 1));
        file.write("", 1);
    }

    // Monotonic clock for interval measurement (system_clock can jump).
    std::chrono::steady_clock::time_point start;
    std::chrono::steady_clock::time_point end;
    {
        boost::interprocess::file_mapping fmap(filename.data(), boost::interprocess::read_write);
        boost::interprocess::mapped_region mreg(fmap, boost::interprocess::read_write);
        mreg.advise(boost::interprocess::mapped_region::advice_sequential);

        // Named cast instead of the C-style (char*) cast.
        char* regptr = static_cast<char*>(mreg.get_address());
        const std::vector<char> data(BLOCK_SIZE, 1);

        start = std::chrono::steady_clock::now();
        for (size_t i = 0; i < N; i++) {
            std::copy_n(data.data(), data.size(), regptr);
            regptr += data.size();
        }
        end = std::chrono::steady_clock::now();
    }
    auto total = end - start;
    std::cout << "test_mapped_file_stdcopy_n (ms): "
              << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << std::endl;
    return 0;
}
int test_fstream_file(size_t TOTAL_SIZE, size_t BLOCK_SIZE, size_t N)
{
    // Benchmark: plain buffered std::ofstream writes of N blocks of
    // BLOCK_SIZE bytes. TOTAL_SIZE is accepted for signature parity with the
    // mapped-file tests but is not needed — the file grows as it is written.
    (void)TOTAL_SIZE;
    const std::string filename = "test_fstream.dat";

    // steady_clock is monotonic — the correct clock for interval timing.
    std::chrono::steady_clock::time_point start;
    std::chrono::steady_clock::time_point end;
    {
        const std::vector<char> data(BLOCK_SIZE, 1);
        std::ofstream file(filename, std::ios::binary | std::ios::trunc);
        if (!file) {
            // Fail fast instead of silently timing writes to a dead stream.
            std::cout << "test_fstream_file: failed to open " << filename << std::endl;
            return 1;
        }
        start = std::chrono::steady_clock::now();
        for (size_t i = 0; i < N; i++) {
            file.write(data.data(), static_cast<std::streamsize>(data.size()));
        }
        end = std::chrono::steady_clock::now();
    }
    auto total = end - start;
    std::cout << "test_fstream_file (ms): "
              << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << std::endl;
    return 0;
}
int main(int argc, char **argv)
{
    // Entry point: expects exactly one argument, the output-file size in GiB,
    // then runs the three write benchmarks with identical parameters.
    if (argc != 2) {
        std::cout << "Usage: " << argv[0] << " <size of output file in gigabytes>" << std::endl;
        exit(1);
    }

    uint64_t totalsize = 0;
    try {
        totalsize = std::stoull(argv[1]);
    } catch (const std::exception&) {
        // stoull throws invalid_argument / out_of_range on bad input; the
        // original let that terminate the program with an unhandled exception.
        std::cout << "Invalid size: " << argv[1] << std::endl;
        exit(1);
    }
    if (totalsize == 0) {
        totalsize = 1;  // treat 0 as the 1 GiB minimum
    }
    // Guard the <<30 below: without this, a huge argument silently wraps.
    if (totalsize > (std::numeric_limits<std::size_t>::max() >> 30)) {
        std::cout << "Size too large: " << argv[1] << std::endl;
        exit(1);
    }

    const std::size_t GB = std::size_t{1} << 30;
    const std::size_t TOTAL_SIZE = static_cast<std::size_t>(totalsize) << 30;
    const std::size_t BLOCK_SIZE = std::size_t{1} << 20;  // 1 MiB per write
    const std::size_t N = TOTAL_SIZE / BLOCK_SIZE;        // number of blocks

    std::cout << "TOTAL_SIZE (GB)=" << TOTAL_SIZE / GB << std::endl;
    test_mapped_file_ostream(TOTAL_SIZE, BLOCK_SIZE, N);
    test_mapped_file_stdcopy_n(TOTAL_SIZE, BLOCK_SIZE, N);
    test_fstream_file(TOTAL_SIZE, BLOCK_SIZE, N);
    return 0;
}
结果:Windows 7,HHD,64GB RAM
性能倍数均以 fstream.write 的耗时(毫秒)为基准进行比较:
TOTAL_SIZE (GB)=5
test_mapped_file_ostream (ms): 24610 (-1.88x)
test_mapped_file_stdcopy_n (ms): 3307 (3.9x)
test_fstream_file (ms): 13052
TOTAL_SIZE (GB)=10
test_mapped_file_ostream (ms): 49524 (-1.3x)
test_mapped_file_stdcopy_n (ms): 6610 (5.8x)
test_fstream_file (ms): 38219
TOTAL_SIZE (GB)=15
test_mapped_file_ostream (ms): 85041 (1.52x)
test_mapped_file_stdcopy_n (ms): 12387 (10.5x)
test_fstream_file (ms): 129964
TOTAL_SIZE (GB)=20
test_mapped_file_ostream (ms): 122897 (1.7x)
test_mapped_file_stdcopy_n (ms): 17542 (12.2x)
test_fstream_file (ms): 213697
性能分析(Profiling)
答案 0 :(得分:4)
您仍在使用面向文本的 ostream。大部分时间都会花在流的格式化处理上。
除此之外,考虑进行顺序访问。
最后找到你的瓶颈
我用我所知道的所有技巧来解决这个问题,并提出了以下非常简单的POSIX mmap
vs。write
比较。
我在适用的情况下使用了madvise
和fadvise
SEQUENTIAL|WILL_NEED
,并确保稀疏度不会导致缓慢。
这一切的简短摘要是:
**Live On Coliru**
#include <boost/chrono.hpp>
#include <boost/chrono/chrono_io.hpp>
#include <iostream>
#include <vector>
#include <algorithm>
// mmap the manual way
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// Benchmark dimensions. Define COLIRU to shrink the workload so it fits the
// Coliru online compiler's limits.
#ifndef COLIRU
// Full-size local run: 5 GiB total, written in 1 MiB blocks.
const std::size_t TOTAL_SIZE = 5ul << 30;
const std::size_t BLOCK_SIZE = 1ul << 20;
#else
// Scaled-down Coliru run: 1 MiB total, 512-byte blocks.
const std::size_t TOTAL_SIZE = 1ul << 20;
const std::size_t BLOCK_SIZE = 1ul << 9;
#endif
static_assert(0 == TOTAL_SIZE%BLOCK_SIZE, "not divisable by block size");
// Number of blocks each benchmark writes.
const int N = TOTAL_SIZE/BLOCK_SIZE;
template <typename Caption, typename F>
auto timed(Caption const& task, F&& f) {
using namespace boost::chrono;
struct _ {
high_resolution_clock::time_point s;
Caption const& task;
~_() { std::cout << " -- (" << task << " completed in " << duration_cast<milliseconds>(high_resolution_clock::now() - s) << ")\n"; }
} timing { high_resolution_clock::now(), task };
return f();
}
void test_mapped_file() {
    // Benchmark: mmap the whole pre-allocated file and copy BLOCK_SIZE chunks
    // into it, with sequential-access advice on both the fd and the mapping.
    std::vector<char> const data(BLOCK_SIZE, 1);

    const std::string filename = "test_filemapping.dat";
    std::remove(filename.c_str());

    int fd = open(filename.c_str(), O_RDWR|O_CREAT, 0644);
    if (fd == -1) {
        perror("open");
        exit(255);
    }

    // posix_fallocate64 returns the error number directly and does NOT set
    // errno, so the original perror("fallocate64") would print an unrelated
    // message. Report the returned code explicitly instead.
    if (int rc = posix_fallocate64(fd, 0, TOTAL_SIZE)) {
        std::cerr << "posix_fallocate64 failed with error " << rc << "\n";
        exit(255);
    }

    posix_fadvise64(fd, 0, TOTAL_SIZE, POSIX_FADV_WILLNEED | POSIX_FADV_SEQUENTIAL);

    char* fmap = static_cast<char*>(mmap64(nullptr, TOTAL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0));
    if (!fmap || fmap == MAP_FAILED) {
        perror("mmap");
        exit(255);
    }
    madvise(fmap, TOTAL_SIZE, MADV_SEQUENTIAL | MADV_WILLNEED);

    timed(filename, [output=fmap, &data]() mutable {
        // size_t index avoids the signed/unsigned comparison against int N.
        for (size_t i = 0; i < static_cast<size_t>(N); i++) {
            std::copy_n(data.data(), data.size(), output);
            output += data.size();
        }
    });

    munmap(fmap, TOTAL_SIZE);
    close(fd);
}
void test_posix_write() {
    // Benchmark: plain POSIX write() of BLOCK_SIZE chunks, with sequential
    // read-ahead advice on the fd.
    std::vector<char> const data(BLOCK_SIZE, 1);

    const std::string filename = "test_posix.dat";
    std::remove(filename.c_str());

    int fd = open(filename.c_str(), O_RDWR|O_CREAT, 0644);
    if (fd == -1) {
        perror("open");
        exit(255);
    }
    posix_fadvise64(fd, 0, TOTAL_SIZE, POSIX_FADV_WILLNEED | POSIX_FADV_SEQUENTIAL);

    timed(filename, [&] {
        for (size_t i = 0; i < static_cast<size_t>(N); i++) {
            // write() may legally return a short count. The original checked
            // this only with assert(), which disappears under NDEBUG and
            // would silently drop bytes. Loop until the whole block is written.
            size_t written = 0;
            while (written < data.size()) {
                ssize_t count = ::write(fd, data.data() + written, data.size() - written);
                if (count == -1) {
                    perror("write");
                    exit(255);
                }
                written += static_cast<size_t>(count);
            }
        }
    });
    close(fd);
}
// Driver: run both benchmarks; each prints its own timing line via timed().
int main() {
    test_mapped_file();
    test_posix_write();
    return 0;
}
在Coliru印刷品上进行测试时:
./a.out; md5sum *.dat
-- (test_filemapping.dat completed in 0 milliseconds)
-- (test_posix.dat completed in 8 milliseconds)
d35bb2e58b602d94ccd9628f249ae7e5 test_filemapping.dat
d35bb2e58b602d94ccd9628f249ae7e5 test_posix.dat
本地运行(5GiB卷):
$ ./test
-- (test_filemapping.dat completed in 1950 milliseconds)
-- (test_posix.dat completed in 1307 milliseconds)
答案 1 :(得分:2)
您可能遇到thrashing,这将显着减慢通过内存映射写入文件所需的时间。您的基准测试会写出近5千兆字节的数据。如果你没有5千兆字节的RAM可用,那么操作系统将忙于在内存中交换脏的页面以获取磁盘上的数据。
作为sehe suggested,您可能会考虑使用madvising,因为在这种情况下,您将按顺序访问内存映射文件:
mreg.advise( boost::interprocess::mapped_region::advice_sequential );
然而,请注意,这并不能解决颠簸(thrashing)问题。