I am currently trying to maximize the speed of insert writes into MongoDB from a C++ application. The behavior I see is that insert_many() operations slow down, a write queue builds up, and subsequent insert_many() operations then have even more documents to insert. I wrote a small sample application to demonstrate the problem. The sample program runs two threads:
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <string>
#include <thread>
#include <vector>
#include <bsoncxx/builder/basic/document.hpp>
#include <bsoncxx/builder/basic/kvp.hpp>
#include <mongocxx/client.hpp>
#include <mongocxx/instance.hpp>
#include <mongocxx/pool.hpp>
#include <mongocxx/uri.hpp>
// AtomicVector (a thread-safe double buffer), Event, DateTime, and lPadd are
// small helpers whose definitions are omitted from this post.
struct CountData {
CountData(const size_t p_index, const std::string& p_word, const size_t p_count)
: index(p_index)
, word(p_word)
, count(p_count)
{
}
const size_t index = 0;
const std::string word;
const int32_t count = 0;
};
struct CollectionData {
CollectionData(const std::string& collectionName) : name(collectionName) {
options.ordered(false);
auto writeConcern = mongocxx::write_concern{};
writeConcern.acknowledge_level(mongocxx::write_concern::level::k_unacknowledged);
options.write_concern(writeConcern);
}
void push_back(const bsoncxx::document::value& value) { documents.push_back(value); }
size_t size() const { return documents.size(); }
void writeAll(mongocxx::pool& pool) {
auto client = pool.acquire();
auto collection = (*client)["frequency"][name];
collection.insert_many(documents, options);
}
void clear() { documents.clear(); }
private:
const std::string name;
mongocxx::options::insert options;
std::vector<bsoncxx::document::value> documents;
};
class FrequencyCounter {
public:
FrequencyCounter(const std::string& mongoUri, const std::string& dictionaryFile)
: _collectionNames({ "A", "B", "C", "D", "E", "F", "G", "H", "I",
"J", "K", "L", "M", "N", "O", "P", "Q", "R",
"S", "T", "U", "V", "W", "X", "Y", "Z" })
, _mongoPool(mongocxx::uri(mongoUri))
, _dictionary(dictionaryFile)
{
for(const auto& name : _collectionNames) {
_collections.push_back(name);
}
_thread = std::thread(&FrequencyCounter::workerThread, this);
}
~FrequencyCounter() {
_isRunning = false;
_event.notify_one();
_thread.join();
}
void Run() {
std::ifstream inFile(_dictionary);
if(!inFile.is_open()) {
std::cerr << "Could not open definition file: " << _dictionary << std::endl;
std::exit(-1);
}
std::string line;
while(std::getline(inFile, line)) {
std::string word = line;
std::transform(word.begin(), word.end(), word.begin(), ::toupper);
size_t index = 0;
for(const auto& letter : _collectionNames) {
size_t count = std::count(word.begin(), word.end(), letter[0]);
if(count > 0)
_dataQueue.addPending(CountData(index, word, count));
++index;
}
_event.notify_one();
}
}
private:
void writeData(const bool flush=false) {
if(!_dataQueue.trySwap())
return; // No data to write
const auto& dataQueue = _dataQueue.active();
for(const auto& data : dataQueue) {
const uint64_t begin = DateTime::now();
auto doc = bsoncxx::builder::basic::document{};
doc.append(bsoncxx::builder::basic::kvp("word", data.word));
doc.append(bsoncxx::builder::basic::kvp("count", data.count));
_collections[data.index].push_back(doc.extract());
const uint64_t end = DateTime::now();
_docCreationTimes.emplace_back(end - begin);
}
for(auto& collection : _collections) {
const size_t currentSize = collection.size();
if(flush || currentSize >= _maxDocQueueSize) {
const uint64_t begin = DateTime::now();
collection.writeAll(_mongoPool);
const uint64_t end = DateTime::now();
_docInsertionTimes.emplace_back(end - begin);
collection.clear();
}
}
}
void workerThread() {
try {
while(_isRunning) {
_event.wait();
_event.reset();
writeData();
}
const bool flush = true;
writeData(flush);
} catch(const std::exception& ex) {
std::cerr << "Exception in thread: " << ex.what();
}
_isRunning = false;
{
uint64_t minTime = std::numeric_limits<uint64_t>::max();
uint64_t maxTime = 0;
uint64_t sumTime = 0;
uint64_t count = 0;
for(const auto& time : _docCreationTimes) {
if(time < minTime)
minTime = time;
if(time > maxTime)
maxTime = time;
sumTime += time;
++count;
}
std::cout << "Doc Creation Time (avg): " << lPadd(std::to_string(sumTime / count), '0', 12) << "ns" << std::endl;
std::cout << "Doc Creation Time (min): " << lPadd(std::to_string(minTime), '0', 12) << "ns" << std::endl;
std::cout << "Doc Creation Time (max): " << lPadd(std::to_string(maxTime), '0', 12) << "ns" << std::endl;
}
{
uint64_t minTime = std::numeric_limits<uint64_t>::max();
uint64_t maxTime = 0;
uint64_t sumTime = 0;
uint64_t count = 0;
for(const auto& time : _docInsertionTimes) {
if(time < minTime)
minTime = time;
if(time > maxTime)
maxTime = time;
sumTime += time;
++count;
}
std::cout << "Doc Insertion Time (avg): " << lPadd(std::to_string(sumTime / count), '0', 12) << "ns" << std::endl;
std::cout << "Doc Insertion Time (min): " << lPadd(std::to_string(minTime), '0', 12) << "ns" << std::endl;
std::cout << "Doc Insertion Time (max): " << lPadd(std::to_string(maxTime), '0', 12) << "ns" << std::endl;
}
}
const size_t _maxDocQueueSize = 10;
const std::vector<std::string> _collectionNames;
mongocxx::instance _mongoInstance;
mongocxx::pool _mongoPool;
std::string _dictionary;
std::vector<CollectionData> _collections;
AtomicVector<CountData> _dataQueue; // thread-safe double buffer
std::vector<uint64_t> _docCreationTimes;
std::vector<uint64_t> _docInsertionTimes;
Event _event;
std::atomic<bool> _isRunning{ true }; // volatile does not provide thread safety; use std::atomic
std::thread _thread;
};
int main(int argc, char* argv[]) {
const std::string mongoUri = "mongodb://localhost:27017/?minPoolSize=50&maxPoolSize=50";
const std::string dictionary = "words_alpha.txt";
FrequencyCounter counter(mongoUri, dictionary);
counter.Run();
return 0;
}
Results:
Doc Creation Time (avg): 000,000,000,837ns
Doc Creation Time (min): 000,000,000,556ns
Doc Creation Time (max): 000,015,521,675ns
Doc Insertion Time (avg): 000,087,038,560ns
Doc Insertion Time (min): 000,000,023,311ns
Doc Insertion Time (max): 005,407,689,435ns
I have tried a number of changes, without luck.
Are there any optimizations or changes that would give me a worker thread able to keep up with the high throughput of the main thread?
Answer 0 (score: 0)
I realize this is a relatively old question, but you may see improved performance by inserting the records in the worker thread with collection.bulk_write(bulk_write).
You build the batch by appending a series of operations (mongocxx::model::insert_one, mongocxx::model::delete_one, etc.) to an instance of mongocxx::bulk_write (class reference docs), and then execute the prepared batch with collection.bulk_write(bulk_write).
Some nice examples can be found here
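Before the full benchmark code, here is a minimal, self-contained sketch of the pattern. It assumes the mongocxx v3.0.x API used in the benchmarks below, where mongocxx::bulk_write is constructed directly (newer drivers build the batch with collection.create_bulk_write() instead); the database, collection, and field names are placeholders:
#include <bsoncxx/builder/stream/document.hpp>
#include <mongocxx/client.hpp>
#include <mongocxx/instance.hpp>
#include <mongocxx/model/insert_one.hpp>
#include <mongocxx/uri.hpp>
int main() {
    mongocxx::instance inst{};                             // one instance per process
    mongocxx::client client{mongocxx::uri{}};              // defaults to localhost:27017
    auto collection = client["test"]["example"];           // placeholder db/collection names
    mongocxx::options::bulk_write opts;
    opts.ordered(false);                                   // unordered: server may parallelize
    mongocxx::bulk_write bulk{opts};                       // v3.0.x: constructed directly
    auto doc = bsoncxx::builder::stream::document{}
        << "key" << 1                                      // placeholder field
        << bsoncxx::builder::stream::finalize;
    bulk.append(mongocxx::model::insert_one{doc.view()});  // queue one insert; repeat per document
    collection.bulk_write(bulk);                           // one call executes the whole batch
    return 0;
}
Note that mongocxx::model::insert_one holds only a view, so each document's value must stay alive until the batch is executed.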
To compare performance:
Test 1:
Inserted 100000 in 27263651us insert_one
Inserted 100000 in 1129957us insert_many
Inserted 100000 in 916561us insert_bulk
Test 2:
Inserted 100000 in 28196463us insert_one
Inserted 100000 in 1089758us insert_many
Inserted 100000 in 967773us insert_bulk
These numbers were obtained with the snippet below (note: mongocxx driver v3.0.3, MongoDB v3.2):
#include <chrono>
#include <iostream>
#include <vector>
#include <bsoncxx/builder/stream/document.hpp>
#include <mongocxx/client.hpp>
#include <mongocxx/instance.hpp>
#include <mongocxx/uri.hpp>
// `collection` is assumed to be a mongocxx::collection obtained elsewhere,
// e.g. from a mongocxx::client connected to the target database.
struct msg {
    long num;
    long size;
    long time;
};
//using insert_one()
void store_msg_one(std::vector<msg> lst)
{
for(int i = 0; i < lst.size(); i++)
{
msg cur_msg = lst[i];
bsoncxx::builder::stream::document msg_info_builder{};
msg_info_builder << "msg_num" << cur_msg.num
<< "msg_size" << cur_msg.size
<< "msg_time" << cur_msg.time;
bsoncxx::document::value doc_val = msg_info_builder << bsoncxx::builder::stream::finalize;
collection.insert_one(doc_val.view());
}
}
//using insert_many()
void store_msg_many(std::vector<msg> lst)
{
std::vector<bsoncxx::document::value> lst2;
for(int i = 0; i < lst.size(); i++)
{
msg cur_msg = lst[i];
bsoncxx::builder::stream::document msg_info_builder{};
msg_info_builder << "msg_num" << cur_msg.num
<< "msg_size" << cur_msg.size
<< "msg_time" << cur_msg.time;
bsoncxx::document::value doc_val = msg_info_builder << bson::builder::stream::finalize;
lst2.push_back(doc_val);
}
collection.insert_many(lst2);
}
//using bulk_write()
void store_msg_bulk(std::vector<msg> lst)
{
mongocxx::options::bulk_write bulk_opt;
mongocxx::write_concern wc;
bulk_opt.ordered(false); //see https://docs.mongodb.com/manual/core/bulk-write-operations/
wc.acknowledge_level(mongocxx::write_concern::level::k_default);
bulk_opt.write_concern(wc);
mongocxx::bulk_write bulk = mongocxx::bulk_write{bulk_opt};
for(int i = 0; i < lst.size(); i++)
{
msg cur_msg = lst[i];
bsoncxx::builder::stream::document msg_info_builder{};
msg_info_builder << "msg_num" << cur_msg.num
<< "msg_size" << cur_msg.size
<< "msg_time" << cur_msg.time;
bsoncxx::document::value doc_val = msg_info_builder << bsoncxx::builder::stream::finalize;
mongocxx::model::insert_one msg_info_insert_op{doc_val.view()};
bulk.append(msg_info_insert_op);
}
collection.bulk_write(bulk);
}
int main()
{
std::vector<msg> lst;
int num_msg = 100000;
for(int i = 0; i < num_msg; i++)
{
msg info;
info.time = 20*i;
info.num = i;
info.size = sizeof(i);
lst.push_back(info);
}
//Test with insert_one(...)
long long start_microsecs = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
store_msg_one(lst);
long long end_microsecs = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
std::cout << "Inserted " << num_msg << " in " << end_microsecs - start_microsecs << "us" << " insert_one(...)" << std::endl;
//Test with insert_many(...)
start_microsecs = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
store_msg_many(lst);
end_microsecs = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
std::cout << "Inserted " << num_msg << " in " << end_microsecs - start_microsecs << "us" << " insert_one(...)" << std::endl;
//Test with bulk_write(...)
start_microsecs = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
store_msg_bulk(lst);
end_microsecs = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
std::cout << "Inserted " << num_msg << " in " << end_microsecs - start_microsecs << "us" << " insert_bulk(...)" << std::endl;
std::cin.ignore();
}
Note: for more information on the bulk_write options, see the MongoDB docs.
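For completeness, a sketch of inspecting the outcome (the snippet above discards it); with an acknowledged write concern, collection.bulk_write returns an optional mongocxx::result::bulk_write:
// Empty for unacknowledged writes; populated for acknowledged ones.
auto result = collection.bulk_write(bulk);
if(result) {
    std::cout << "Inserted " << result->inserted_count() << " documents" << std::endl;
}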