I have a small set of sample data - nine files of 1,000,000 lines each. The real data is all captured live and then inserted into a MySQL database. The first time I tried this, inserting the full dataset (~500 million rows, 24GB of plain text) took just over 16 hours. That struck me as far too slow, so I've been trying to speed it up.
To that end, I have a fully functional multithreaded implementation that I've been working on for a while. For each file, it creates a table, writes the values out to a temporary file in a form suitable for LOAD DATA, and then runs LOAD DATA LOCAL INFILE against it.
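Concretely, once a temporary file has been written, the statement a thread issues has this shape (the table name is derived from the input file name; both names here are illustrative):

LOAD DATA LOCAL INFILE '/tmp/short_scribe_0.data.sql'
    INTO TABLE short_scribe_0
    FIELDS TERMINATED BY ','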
The relevant code follows:
// Shared state (scribe_files, file_list_mutex, log_mutex, server, user,
// password, database, MAX_TOKENS, MAX_TOKEN_LENGTH) and the helper functions
// (get_table_name, split_string, split_scribe_line, get_current_us_time,
// us_to_hhmmss) are defined elsewhere in the program.
void insert_data(uint32_t tid)
{
    string scribe_file;
    file_list_mutex.lock();
    while(scribe_files.size() > 0)
    {
        // Claim the next input file, then release the lock for the long-running work.
        scribe_file = *(scribe_files.begin());
        scribe_files.erase(scribe_files.begin());
        file_list_mutex.unlock();
        MYSQL *thread_con = mysql_init(nullptr);
        if(thread_con == nullptr)
        {
            log_mutex.lock();
            cerr << "Thread " << tid << ": mysql_init failed (out of memory?)" << endl;
            log_mutex.unlock();
            return;
        }
        if(nullptr == (mysql_real_connect(thread_con, server.c_str(), user.c_str(),
                                          password.c_str(), nullptr, 0, nullptr, 0)))
        {
            log_mutex.lock();
            cerr << "Thread " << tid << ": " << mysql_error(thread_con) << endl;
            log_mutex.unlock();
            mysql_close(thread_con);
            return;
        }
        if(mysql_select_db(thread_con, database.c_str()))
        {
            log_mutex.lock();
            cerr << "Thread " << tid << ": " << mysql_error(thread_con) << endl;
            log_mutex.unlock();
            mysql_close(thread_con);
            return;
        }
        string table_name = get_table_name(scribe_file);
        if(table_name.empty())
        {
            log_mutex.lock();
            cerr << "Thread " << tid << ": Unusable input file: " << scribe_file << endl;
            log_mutex.unlock();
            mysql_close(thread_con);
            return;
        }
        ifstream scribe_stream(scribe_file);
        if(!scribe_stream.good())
        {
            log_mutex.lock();
            cerr << "Thread " << tid << ": Error opening " << scribe_file << endl;
            log_mutex.unlock();
            mysql_close(thread_con);
            return;
        }
        // The temporary file is /tmp/<input file name>.sql
        string output_filename = "/tmp/";
        {
            vector<string> file_path = split_string(scribe_file, '/');
            output_filename.append(file_path.rbegin()[0]);
        }
        output_filename.append(".sql");
        ofstream output;
        output.open(output_filename, ios::out | ios::trunc);
        if(!output.good())
        {
            log_mutex.lock();
            cerr << "Thread " << tid << ": Error opening " << output_filename << endl;
            log_mutex.unlock();
            scribe_stream.close();
            mysql_close(thread_con);
            return;
        }
        string create_query = "CREATE TABLE IF NOT EXISTS ";
        string table_format = " (IDpk INT NOT NULL auto_increment,"
                              " s CHAR(8) NOT NULL,"
                              " si INT unsigned NOT NULL,"
                              " pq TINYINT unsigned NOT NULL,"
                              " pr BIGINT unsigned DEFAULT NULL,"
                              " sz INT DEFAULT NULL,"
                              " io TINYINT unsigned DEFAULT NULL,"
                              " ipslt TINYINT unsigned DEFAULT NULL,"
                              " is TINYINT unsigned DEFAULT NULL,"
                              " ilp BIGINT unsigned DEFAULT NULL,"
                              " ips INT unsigned DEFAULT NULL,"
                              " at INT unsigned DEFAULT NULL,"
                              " vn TINYINT unsigned NOT NULL,"
                              " ms BIGINT unsigned NOT NULL,"
                              " us BIGINT unsigned NOT NULL,"
                              " PRIMARY KEY(IDpk),"
                              " KEY(us),"
                              " KEY(s),"
                              " KEY(pq))";
        create_query.append(table_name);
        create_query.append(table_format);
        if(mysql_query(thread_con, create_query.c_str()))
        {
            log_mutex.lock();
            cerr << "Thread " << tid << ": " << mysql_error(thread_con) << endl;
            log_mutex.unlock();
            scribe_stream.close();
            output.close();
            mysql_close(thread_con);
            return;
        }
        string scribe_stream_line;
        char values[MAX_TOKENS * MAX_TOKEN_LENGTH];
        usec_t start_time = get_current_us_time();
        uint64_t num_lines = 0;
        log_mutex.lock();
        cout << "Thread " << tid << ": Starting " << scribe_file << endl;
        log_mutex.unlock();
        char scribe_tokens[MAX_TOKENS][MAX_TOKEN_LENGTH];
        while(getline(scribe_stream, scribe_stream_line))
        {
            split_scribe_line(scribe_stream_line, scribe_tokens);
            num_lines++;
            if(scribe_tokens[6][0] != '\0')
            {
                try
                {
                    // Token 2 selects the record layout; columns a record
                    // doesn't carry are written as NULL.
                    uint32_t item_type = stoi(scribe_tokens[2]);
                    fill(values, values + (MAX_TOKENS * MAX_TOKEN_LENGTH), '\0');
                    switch(item_type)
                    {
                        case 0:
                        case 1:
                        {
                            sprintf(values,
                                    "%s,%s,%s,"
                                    "%s,%s,"
                                    "NULL,NULL,NULL,NULL,NULL,NULL,"
                                    "%s,%s,%s",
                                    scribe_tokens[0], scribe_tokens[1], scribe_tokens[2],
                                    scribe_tokens[3], scribe_tokens[4], scribe_tokens[5],
                                    scribe_tokens[6], scribe_tokens[7]);
                            break;
                        }
                        case 2:
                        {
                            sprintf(values,
                                    "%s,%s,%s,"
                                    "%s,%s,%s,%s,"
                                    "NULL,NULL,NULL,NULL,"
                                    "%s,%s,%s",
                                    scribe_tokens[0], scribe_tokens[1], scribe_tokens[2],
                                    scribe_tokens[3], scribe_tokens[4], scribe_tokens[5],
                                    scribe_tokens[6], scribe_tokens[7], scribe_tokens[8],
                                    scribe_tokens[9]);
                            break;
                        }
                        case 3:
                        {
                            sprintf(values,
                                    "%s,%s,%s,"
                                    "NULL,NULL,NULL,NULL,"
                                    "NULL,NULL,NULL,NULL,"
                                    "%s,%s,%s",
                                    scribe_tokens[0], scribe_tokens[1], scribe_tokens[2],
                                    scribe_tokens[3], scribe_tokens[4], scribe_tokens[5]);
                            break;
                        }
                        case 4:
                        {
                            sprintf(values,
                                    "%s,%s,%s,"
                                    "%s,%s,NULL,NULL,"
                                    "%s,%s,%s,NULL,"
                                    "%s,%s,%s",
                                    scribe_tokens[0], scribe_tokens[1], scribe_tokens[2],
                                    scribe_tokens[3], scribe_tokens[4], scribe_tokens[5],
                                    scribe_tokens[6], scribe_tokens[7], scribe_tokens[8],
                                    scribe_tokens[9], scribe_tokens[10]);
                            break;
                        }
                        case 5:
                        {
                            sprintf(values,
                                    "%s,%s,%s,"
                                    "NULL,NULL,NULL,NULL,NULL,NULL,NULL,"
                                    "%s,%s,%s,%s",
                                    scribe_tokens[0], scribe_tokens[1], scribe_tokens[2],
                                    scribe_tokens[3], scribe_tokens[4], scribe_tokens[5],
                                    scribe_tokens[6]);
                            break;
                        }
                        default:
                            log_mutex.lock();
                            cerr << "Thread " << tid << ": Unknown pq type " << item_type << "\n"
                                 << "    " << scribe_stream_line << endl;
                            log_mutex.unlock();
                            continue; // skip this line entirely
                    }
                    output << values << endl;
                }
                catch(exception &ex)
                {
                    log_mutex.lock();
                    cerr << "Thread " << tid << ": Error parsing scribe line\n    '"
                         << scribe_stream_line << "'\n    " << ex.what() << endl;
                    log_mutex.unlock();
                    scribe_stream.close();
                    output.close();
                    throw;
                }
            }
        }
        log_mutex.lock();
        cout << "Thread " << tid << ": preparing " << num_lines << " lines took "
             << us_to_hhmmss(get_current_us_time() - start_time) << endl;
        log_mutex.unlock();
        string insert_query = "LOAD DATA LOCAL INFILE ";
        insert_query.append("'" + output_filename + "' INTO TABLE "
                            + table_name + " FIELDS TERMINATED BY ','");
        log_mutex.lock();
        cout << "Thread " << tid << ": Loading results into database." << endl;
        log_mutex.unlock();
        start_time = get_current_us_time();
        if(mysql_query(thread_con, insert_query.c_str()))
        {
            log_mutex.lock();
            cerr << "Thread " << tid << ": " << mysql_error(thread_con) << endl;
            log_mutex.unlock();
            scribe_stream.close();
            output.close();
            mysql_close(thread_con);
            return;
        }
        mysql_close(thread_con);
        log_mutex.lock();
        cout << "Thread " << tid << ": " << "Insertion took "
             << us_to_hhmmss(get_current_us_time() - start_time) << endl;
        log_mutex.unlock();
        scribe_stream.close();
        output.close();
        remove(output_filename.c_str());
        // Re-acquire the lock before re-checking the loop condition.
        file_list_mutex.lock();
    }
    file_list_mutex.unlock();
}
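For completeness, the workers are launched with one thread per file, capped at the core count. This is only a minimal sketch of how that driver might look (run_workers is an illustrative name; the version banner and timing output from the logs below are printed elsewhere):

#include <algorithm>
#include <cstdint>
#include <thread>
#include <vector>

// Hypothetical driver sketch. Assumes the same `using namespace std` and
// globals (scribe_files) as insert_data() above.
void run_workers()
{
    // One worker per file, capped at the number of hardware threads.
    size_t num_threads = min<size_t>(thread::hardware_concurrency(),
                                     scribe_files.size());
    vector<thread> workers;
    for(uint32_t tid = 0; tid < num_threads; tid++)
        workers.emplace_back(insert_data, tid);
    for(auto &worker : workers)
        worker.join(); // each worker keeps pulling files until the list is empty
}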
What really escapes me here is the inconsistent performance. For example, when I run it over two files, with one thread and then with two, the results are roughly equivalent:
Scribe DB CLI V1.07
Got 8 CPU cores.
Parsing 2 input files.
Thread 0: Starting scribe_20170511/short_scribe_1.data
Thread 1: Starting scribe_20170511/short_scribe_0.data
Thread 0: preparing 1000000 lines took 00:00:02
Thread 0: Loading results into database.
Thread 1: preparing 1000000 lines took 00:00:02
Thread 1: Loading results into database.
Thread 1: Insertion took 00:00:35
Thread 0: Insertion took 00:00:37
Total runtime: 00:00:40
Scribe DB CLI V1.07
Got 1 CPU cores.
Parsing 2 input files.
Thread 0: Starting scribe_20170511/short_scribe_0.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:19
Thread 0: Starting scribe_20170511/short_scribe_1.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:20
Total runtime: 00:00:43
I drop the database before every run of this test.
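(Concretely, that reset amounts to something like the following, with an illustrative database name:)

mysql> DROP DATABASE IF EXISTS scribe;
mysql> CREATE DATABASE scribe;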
Likewise, when running over all nine input files my results vary, but none of them show the speedup I would expect:
Scribe DB CLI V1.07
Got 1 CPU cores.
Parsing 9 input files.
Thread 0: Starting scribe_20170511/short_scribe_0.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:19
Thread 0: Starting scribe_20170511/short_scribe_1.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:20
Thread 0: Starting scribe_20170511/short_scribe_2.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:20
Thread 0: Starting scribe_20170511/short_scribe_3.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:21
Thread 0: Starting scribe_20170511/short_scribe_4.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:21
Thread 0: Starting scribe_20170511/short_scribe_5.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:21
Thread 0: Starting scribe_20170511/short_scribe_6.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:21
Thread 0: Starting scribe_20170511/short_scribe_7.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:21
Thread 0: Starting scribe_20170511/short_scribe_8.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 0: Insertion took 00:00:21
Total runtime: 00:03:27
Scribe DB CLI V1.07
Got 8 CPU cores.
Parsing 9 input files.
Thread 3: Starting scribe_20170511/short_scribe_3.data
Thread 1: Starting scribe_20170511/short_scribe_4.data
Thread 2: Starting scribe_20170511/short_scribe_1.data
Thread 0: Starting scribe_20170511/short_scribe_0.data
Thread 4: Starting scribe_20170511/short_scribe_2.data
Thread 5: Starting scribe_20170511/short_scribe_5.data
Thread 3: preparing 1000000 lines took 00:00:03
Thread 3: Loading results into database.
Thread 0: preparing 1000000 lines took 00:00:03
Thread 0: Loading results into database.
Thread 2: preparing 1000000 lines took 00:00:03
Thread 2: Loading results into database.
Thread 4: preparing 1000000 lines took 00:00:03
Thread 4: Loading results into database.
Thread 1: preparing 1000000 lines took 00:00:03
Thread 1: Loading results into database.
Thread 5: preparing 1000000 lines took 00:00:03
Thread 5: Loading results into database.
Thread 0: Insertion took 00:02:20
Thread 0: Starting scribe_20170511/short_scribe_6.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 1: Insertion took 00:02:44
Thread 1: Starting scribe_20170511/short_scribe_7.data
Thread 1: preparing 1000000 lines took 00:00:01
Thread 1: Loading results into database.
Thread 5: Insertion took 00:03:01
Thread 5: Starting scribe_20170511/short_scribe_8.data
Thread 5: preparing 1000000 lines took 00:00:01
Thread 5: Loading results into database.
Thread 3: Insertion took 00:03:07
Thread 4: Insertion took 00:03:10
Thread 2: Insertion took 00:03:14
Thread 0: Insertion took 00:01:37
Thread 1: Insertion took 00:01:20
Thread 5: Insertion took 00:01:06
Total runtime: 00:04:14
Scribe DB CLI V1.08
Got 8 CPU cores.
Parsing 9 input files.
Thread 5: Starting scribe_20170511/short_scribe_4.data
Thread 1: Starting scribe_20170511/short_scribe_1.data
Thread 3: Starting scribe_20170511/short_scribe_2.data
Thread 4: Starting scribe_20170511/short_scribe_5.data
Thread 0: Starting scribe_20170511/short_scribe_0.data
Thread 2: Starting scribe_20170511/short_scribe_3.data
Thread 5: preparing 1000000 lines took 00:00:03
Thread 5: Loading results into database.
Thread 4: preparing 1000000 lines took 00:00:03
Thread 4: Loading results into database.
Thread 1: preparing 1000000 lines took 00:00:03
Thread 1: Loading results into database.
Thread 3: preparing 1000000 lines took 00:00:03
Thread 3: Loading results into database.
Thread 0: preparing 1000000 lines took 00:00:03
Thread 0: Loading results into database.
Thread 2: preparing 1000000 lines took 00:00:03
Thread 2: Loading results into database.
Thread 0: Insertion took 00:01:43
Thread 0: Starting scribe_20170511/short_scribe_6.data
Thread 0: preparing 1000000 lines took 00:00:01
Thread 0: Loading results into database.
Thread 5: Insertion took 00:02:00
Thread 5: Starting scribe_20170511/short_scribe_7.data
Thread 5: preparing 1000000 lines took 00:00:01
Thread 5: Loading results into database.
Thread 2: Insertion took 00:02:02
Thread 2: Starting scribe_20170511/short_scribe_8.data
Thread 4: Insertion took 00:02:04
Thread 3: Insertion took 00:02:04
Thread 2: preparing 1000000 lines took 00:00:01
Thread 2: Loading results into database.
Thread 1: Insertion took 00:02:06
Thread 0: Insertion took 00:00:59
Thread 5: Insertion took 00:00:49
Thread 2: Insertion took 00:00:48
Total runtime: 00:02:57
What am I looking at here? During the queries, CPU utilization across my 8 cores sits at 10-50%, and disk utilization on my SSD at 15-40 MB/s. I'm sure there is some sophisticated caching and other such machinery at work behind the scenes that could explain the inconsistency, but I really expected the separate tables to yield a much better speedup than this.
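(For anyone trying to reproduce this: the per-thread LOAD DATA sessions and InnoDB's buffer pool and I/O activity can be watched while the test runs with the standard client commands below.)

mysql> SHOW FULL PROCESSLIST;
mysql> SHOW ENGINE INNODB STATUS\G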
Thanks in advance. =)
Details added for Rick James:
mysql> SHOW VARIABLES LIKE '%buffer%';
+-------------------------------------+----------------+
| Variable_name | Value |
+-------------------------------------+----------------+
| bulk_insert_buffer_size | 8388608 |
| innodb_buffer_pool_chunk_size | 134217728 |
| innodb_buffer_pool_dump_at_shutdown | ON |
| innodb_buffer_pool_dump_now | OFF |
| innodb_buffer_pool_dump_pct | 25 |
| innodb_buffer_pool_filename | ib_buffer_pool |
| innodb_buffer_pool_instances | 1 |
| innodb_buffer_pool_load_abort | OFF |
| innodb_buffer_pool_load_at_startup | ON |
| innodb_buffer_pool_load_now | OFF |
| innodb_buffer_pool_size | 134217728 |
| innodb_change_buffer_max_size | 25 |
| innodb_change_buffering | all |
| innodb_log_buffer_size | 16777216 |
| innodb_sort_buffer_size | 1048576 |
| join_buffer_size | 262144 |
| key_buffer_size | 16777216 |
| myisam_sort_buffer_size | 8388608 |
| net_buffer_length | 16384 |
| preload_buffer_size | 32768 |
| read_buffer_size | 131072 |
| read_rnd_buffer_size | 262144 |
| sort_buffer_size | 262144 |
| sql_buffer_result | OFF |
+-------------------------------------+----------------+
24 rows in set (0.00 sec)
I don't plan to merge these tables. The reason I'm not inserting the rows as they are captured (aside from the fact that inserting a single file takes longer than recording the next one does) is that the work is done on files created on the server over the course of the day. Finally, I will need at least the first two indexes, since sensible queries against this data have to be based on those two columns.
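For clarity, those two are the KEY(us) and KEY(s) entries in the CREATE TABLE above. They only need to exist by the time the data is queried, so adding them after the bulk load, rather than maintaining them during it, would look like this (table name illustrative):

mysql> ALTER TABLE short_scribe_0 ADD KEY(us), ADD KEY(s);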