我使用libpuzzle分析了300万张图片。我的主服务器200万,另一个100万。我想将这些信息合并到一个MySQL数据库中。
我需要在test_images_pending
数据库中记录并将它们插入到test_images
中,但我必须以没有重复数据的方式进行。
test_images
在所有表格中总共有1.15亿条记录,单词有1.1亿条记录。尺寸~4.4 GB
test_images_pending
尊重6900万和6500万。尺寸~2.6 GB
我的计算机上有8GB内存,如果必须的话,我愿意在内存中加载所有内容(或尝试),以加快速度。
我希望对我的代码和/或使MySQL更快的技术进行一些优化我可以将速率从每秒约2张图片(从test_images_pending.picture表)提高到更易于管理的速度。至少每秒会有100张图片。
以下是test_images
和test_images_pending
的表格设置:
--
-- Table structure for table `errors`
--
CREATE TABLE IF NOT EXISTS `errors` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`url` varchar(255) NOT NULL,
`num` int(11) NOT NULL,
`pid` bigint(20) unsigned NOT NULL,
`error` varchar(512) NOT NULL,
`datetime` datetime NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=245688 ;
-- --------------------------------------------------------
--
-- Table structure for table `pictures`
--
CREATE TABLE IF NOT EXISTS `pictures` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`digest` char(32) NOT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `idx_digest` (`digest`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=1107725 ;
-- --------------------------------------------------------
--
-- Table structure for table `signatures`
--
CREATE TABLE IF NOT EXISTS `signatures` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`compressed_signature` varchar(338) NOT NULL,
`picture_id` int(11) NOT NULL,
PRIMARY KEY (`id`),
KEY `picture_id` (`picture_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=1107725 ;
-- --------------------------------------------------------
--
-- Table structure for table `stored_pictures`
--
CREATE TABLE IF NOT EXISTS `stored_pictures` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`url` varchar(255) NOT NULL,
`pid` bigint(20) unsigned NOT NULL,
`num` int(11) NOT NULL,
`updated_at` datetime DEFAULT NULL,
`created_at` datetime DEFAULT NULL,
`picture_id` int(11) NOT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `idx_url` (`url`),
KEY `idx_picture_id` (`picture_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=2773867 ;
-- --------------------------------------------------------
--
-- Table structure for table `words`
--
CREATE TABLE IF NOT EXISTS `words` (
`pos_and_word` char(5) NOT NULL,
`signature_id` int(11) NOT NULL,
KEY `idx_pos_and_word` (`pos_and_word`),
KEY `signature_id` (`signature_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
--
这是我运行的php PDO代码:
<html>
<head>
<link href="../css/print.css" rel="stylesheet" type="text/css" media="print" /> <!-- siehe screen.css -->
<link href="../css/screen.css" rel="stylesheet" type="text/css" media="screen, projection" />
<!--[if lte IE 6]><link rel="stylesheet" href="../css/ielte6.css" type="text/css" media="screen" /><![endif]-->
</head>
<body>
<?php
ini_set('max_execution_time', 0);
$dbh = new PDO("mysql:host=127.0.0.1;port=3306;dbname=test_images_pending;charset=utf-8", "root", "");
$dbh->setAttribute( PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION );
$dbh->setAttribute(PDO::ATTR_AUTOCOMMIT, FALSE);
try {
$query = "select id,digest from test_images_pending.pictures";
$sth = $dbh->prepare($query);
$sth->execute();
while ($pending_pictures_rows = $sth->fetch(PDO::FETCH_ASSOC, PDO::FETCH_ORI_NEXT)) {
// Print out what id it's on.
print $pending_pictures_rows['id']."<br>";
buffer_flush();
try {
$dbh->beginTransaction();
$query = "SELECT COUNT(id) from test_images.pictures WHERE digest = :digest";
$sth1 = $dbh->prepare($query);
$sth1->bindParam(':digest', $pending_pictures_rows['digest']);
$sth1->execute();
$count = $sth1->fetchColumn();
if ($count == 1) {
$query = "SELECT id from test_images.pictures WHERE digest = :digest";
$sth2 = $dbh->prepare($query);
$sth2->bindParam(':digest', $pending_pictures_rows['digest']);
$sth2->execute();
$correct_pic_id = $sth2->fetchColumn();
if(!isset($correct_pic_id) or empty($correct_pic_id)) {
throw new PDOException('correct_pic_id was empty');
}
$query = "select * from test_images_pending.stored_pictures WHERE picture_id = :picture_id";
$sth3 = $dbh->prepare($query);
$sth3->bindParam(':picture_id', $pending_pictures_rows['id']);
$sth3->execute();
while ($row = $sth3->fetch(PDO::FETCH_ASSOC, PDO::FETCH_ORI_NEXT)) {
$query = "INSERT INTO test_images.stored_pictures
(id, url, pid, num, updated_at, created_at, picture_id)
VALUES
(default, :url, :pid, :num, :updated_at, :created_at, :picture_id);";
$sth4 = $dbh->prepare($query);
$sth4->bindParam(':url', $row['url']);
$sth4->bindParam(':pid', $row['pid']);
$sth4->bindParam(':num', $row['num']);
$sth4->bindParam(':updated_at', $row['updated_at']);
$sth4->bindParam(':created_at', $row['created_at']);
$sth4->bindParam(':picture_id', $correct_pic_id);
$sth4->execute();
}
$query = "DELETE FROM test_images_pending.stored_pictures WHERE picture_id = :picture_id;";
$sth5 = $dbh->prepare($query);
$sth5->bindParam(':picture_id', $pending_pictures_rows['id']);
$sth5->execute();
$query = "select id from test_images_pending.signatures WHERE picture_id = :picture_id;";
$sth6 = $dbh->prepare($query);
$sth6->bindParam(':picture_id', $pending_pictures_rows['id']);
$sth6->execute();
$signature_id = $sth6->fetchColumn();
if(!isset($signature_id) or empty($signature_id)) {
throw new PDOException('signature_id was empty');
}
$query = "DELETE FROM test_images_pending.words WHERE signature_id = :signature_id;";
$sth7 = $dbh->prepare($query);
$sth7->bindParam(':signature_id', $signature_id);
$sth7->execute();
$query = "DELETE FROM test_images_pending.signatures WHERE picture_id = :picture_id";
$sth8 = $dbh->prepare($query);
$sth8->bindParam(':picture_id', $pending_pictures_rows['id']);
$sth8->execute();
$query = "DELETE FROM test_images_pending.pictures WHERE digest = :digest";
$sth9 = $dbh->prepare($query);
$sth9->bindParam(':digest', $pending_pictures_rows['digest']);
$sth9->execute();
} else if ($count == 0){
$query = "INSERT INTO test_images.pictures
(id, digest)
VALUES
(default, :digest);";
$sth2 = $dbh->prepare($query);
$sth2->bindParam(':digest', $pending_pictures_rows['digest']);
$sth2->execute();
$new_pic_id = $dbh->lastInsertId();
$query = "select * from test_images_pending.stored_pictures WHERE picture_id = :picture_id";
$sth3 = $dbh->prepare($query);
$sth3->bindParam(':picture_id', $pending_pictures_rows['id']);
$sth3->execute();
while ($row = $sth3->fetch(PDO::FETCH_ASSOC, PDO::FETCH_ORI_NEXT)) {
$query = "INSERT INTO test_images.stored_pictures
(id, url, pid, num, updated_at, created_at, picture_id)
VALUES
(default, :url, :pid, :num, :updated_at, :created_at, :picture_id);";
$sth4 = $dbh->prepare($query);
$sth4->bindParam(':url', $row['url']);
$sth4->bindParam(':pid', $row['pid']);
$sth4->bindParam(':num', $row['num']);
$sth4->bindParam(':updated_at', $row['updated_at']);
$sth4->bindParam(':created_at', $row['created_at']);
$sth4->bindParam(':picture_id', $new_pic_id);
$sth4->execute();
}
$query = "DELETE FROM test_images_pending.stored_pictures WHERE picture_id = :picture_id;";
$sth5 = $dbh->prepare($query);
$sth5->bindParam(':picture_id', $pending_pictures_rows['id']);
$sth5->execute();
$query = "select id,compressed_signature from test_images_pending.signatures WHERE picture_id = :picture_id;";
$sth6 = $dbh->prepare($query);
$sth6->bindParam(':picture_id', $pending_pictures_rows['id']);
$sth6->execute();
$fetched = $sth6->fetch(PDO::FETCH_ASSOC);
$signature_id = $fetched['id'];
if(!isset($signature_id) or empty($signature_id)) {
print_r($sth6->fetch(PDO::FETCH_ASSOC));
throw new PDOException('signature_id was empty');
}
$compressed_signature = $fetched['compressed_signature'];
if(!isset($compressed_signature) or empty($compressed_signature)) {
print_r($sth6->fetch(PDO::FETCH_ASSOC));
throw new PDOException('compressed_signature was empty');
}
$query = "INSERT INTO test_images.signatures
(id, compressed_signature, picture_id)
VALUES
(default, :compressed_signature, :picture_id);";
$sth7 = $dbh->prepare($query);
$sth7->bindParam(':picture_id', $new_pic_id);
$sth7->bindParam(':compressed_signature', $compressed_signature);
$sth7->execute();
$new_sig_id = $dbh->lastInsertId();
$query = "SELECT pos_and_word FROM test_images_pending.words WHERE signature_id = :signature_id";
$sth8 = $dbh->prepare($query);
$sth8->bindParam(':signature_id', $signature_id);
$sth8->execute();
while ($row = $sth8->fetch(PDO::FETCH_ASSOC, PDO::FETCH_ORI_NEXT)) {
$query = "INSERT INTO test_images.words
(pos_and_word, signature_id)
VALUES
(:pos_and_word, :signature_id);";
$sth9 = $dbh->prepare($query);
$sth9->bindParam(':pos_and_word', $row['pos_and_word']);
$sth9->bindParam(':signature_id', $new_sig_id);
$sth9->execute();
}
$query = "DELETE FROM test_images_pending.words WHERE signature_id = :signature_id;";
$sth10 = $dbh->prepare($query);
$sth10->bindParam(':signature_id', $signature_id);
$sth10->execute();
$query = "DELETE FROM test_images_pending.signatures WHERE picture_id = :picture_id";
$sth11 = $dbh->prepare($query);
$sth11->bindParam(':picture_id', $pending_pictures_rows['id']);
$sth11->execute();
$query = "DELETE FROM test_images_pending.pictures WHERE digest = :digest";
$sth12 = $dbh->prepare($query);
$sth12->bindParam(':digest', $pending_pictures_rows['digest']);
$sth12->execute();
} else {
throw new PDOException("Found more than 1 match for the digest '{$pending_pictures_rows['digest']}' in 'test_images.pictures' ", $query);
}
$dbh->commit();
} catch (PDOException $e) {
$dbh->rollback();
print "<pre>"; print_r($e); print "</pre>"; exit;
}
}
try {
$dbh->beginTransaction();
$query = "SELECT * FROM test_images_pending.errors";
$sth13 = $dbh->prepare($query);
$sth13->execute();
while ($row = $sth13->fetch(PDO::FETCH_ASSOC, PDO::FETCH_ORI_NEXT)) {
$query = "INSERT INTO test_images.errors
(id, url, num, pid, error, datetime)
VALUES
(default, :url, :num, :pid, :error, :datetime);";
$sth14 = $dbh->prepare($query);
$sth14->bindParam(':url', $row['url']);
$sth14->bindParam(':num', $row['num']);
$sth14->bindParam(':pid', $row['pid']);
$sth14->bindParam(':error', $row['error']);
$sth14->bindParam(':datetime', $row['datetime']);
$sth14->execute();
}
$query = "DELETE FROM test_images_pending.errors WHERE 1";
$sth15 = $dbh->prepare($query);
$sth15->execute();
$dbh->commit();
} catch (PDOException $e) {
$dbh->rollback();
print "<pre>"; print_r($e); print "</pre>"; exit;
}
} catch (PDOException $e) {
print "<pre>"; print_r($e); print "</pre>"; exit;
}
function buffer_flush(){
echo str_pad('', 512);
echo '<!-- -->';
if(ob_get_length()){
@ob_flush();
@flush();
@ob_end_flush();
}
@ob_start();
}
?>
</body>
</html>
编辑:
一些分析:
这个INSERT每个非相似图片运行100次(到目前为止每6个约5个)。完成while循环通常需要0.5到0.9秒,每个INSERT平均为0.007。
$query = "INSERT INTO test_images.words
(pos_and_word, signature_id)
VALUES
(:pos_and_word, :signature_id);";
$sth9 = $dbh->prepare($query);
$sth9->bindParam(':pos_and_word', $row['pos_and_word']);
$sth9->bindParam(':signature_id', $new_sig_id);
$sth9->execute();
DELETE FROM test_images_pending.stored_pictures WHERE picture_id = :picture_id;
select * from test_images_pending.stored_pictures WHERE picture_id = :picture_id
DELETE FROM test_images_pending.stored_pictures WHERE picture_id = :picture_id;
每张相似的照片平均需要0.15秒左右(约为6分之一)。
编辑2:
通过此基准测试:http://we-love-php.blogspot.com/2012/08/mass-inserts-updates-sqlite-vs-mysql.html
只需简单地写入文本文件,例如:
,即可替换编辑1中前面提到的慢速循环$inserts = array();
while ($row = $sth8->fetch(PDO::FETCH_ASSOC, PDO::FETCH_ORI_NEXT)) {
$inserts[] = "(".$dbh->quote($row['pos_and_word']).", ".$dbh->quote($new_sig_id).")";
}
$query = "INSERT INTO imvu_images.words (pos_and_word, signature_id) VALUES " . implode(',',$inserts) . ";";
file_put_contents("inserts.sql", $query."\n", FILE_APPEND);
让它更快。但不是每秒100,更像是10-20。然后,我可以稍后执行SQL,它会立即运行。 (这就是为什么我认为我的代码存在问题)。我想要每秒100次的原因是因为我可以分析图像并将它们以每秒30个的速度插入到1个数据库中。按照这个速度,我可以更快地分析200万个图像并让它逐个插入,而不是大量插入行。这似乎不对,服务器可以下载30个图像,分析30个图像,然后在1秒内完成30个插入,但只是做这些不同的SQL语句甚至无法匹配。
编辑3:
使用以下内容更新了my.ini:
key_buffer_size=4000M
read_buffer_size=32M
read_rnd_buffer_size=200M
bulk_insert_buffer_size=1000M
myisam_max_sort_file_size=10000M
myisam_repair_threads=1
tmp_table_size = 1024M
max_heap_table_size = 1024M
join_buffer_size=8M
sort_buffer_size=8M
max_allowed_packet=32M
max_connect_errors=10
myisam_sort_buffer_size=256M
query_cache_limit=12M
query_cache_size=256M
query_cache_type=1
在不使用file_put_contents hack的情况下,似乎已经将性能提高了2倍。尽管如此,每秒5条记录并没有削减它。
答案 0 :(得分:1)
这个过程如此缓慢的原因并不是因为个别查询很慢 - 事实上,我对它的速度有多快感到惊讶 - 但是因为你通过循环一次一个地处理数百万条记录通过外部结果集中的每个记录。 SQL擅长的是一次性处理数百万条记录。
你的代码中有太多的业务逻辑让我想要为你重写整个事情,但我想你想重新编写代码
INSERT INTO test_images.pictures
(id, digest)
SELECT id, digest
from test_images_pending.pictures
where id not in
(select id from test_images.pictures)
对其他表格执行相同操作。这应该运行得非常快 - 如果你有一个好的索引方案,你几乎肯定会受到I / O限制。你一定应该达到每秒2个以上的记录!
答案 1 :(得分:0)
为什么不能使用Mysql存储过程?它们直接在Mysql服务器中执行,而不是从PHP执行查询。 http://dev.mysql.com/doc/refman/5.0/en/create-procedure.html
从php调用存储过程:
$res = mysql_query('call sp_sel_test()');
if ($res === FALSE) {
die(mysql_error());
}
您需要在连接时设置客户端标志,以便使用php存储过程。用这个: 的mysql_connect($这 - &GT; H,$这 - &GT; U,$这 - 指p,假,65536);
有关详细信息,请参阅MySQL Client Flags。
答案 2 :(得分:0)
编辑:主要问题是要插入的源表上的索引。建议在进行大量插入之前删除任何不需要的索引,然后重建后缀。
结合调整mysql设置和下面的代码,我能够让重复的图像(连接部分)在30秒内完成50,000次,25秒就是JOIN操作。
第二部分我正在使用NOT IN,这是大多数时间发生的地方,但它以每秒800条记录的速度插入,因此它超出了我的目标。
我打算将这个问题保持开放一段时间,以确定它是否可以更优化,因为我有3900万条记录要处理。
<html>
<head>
<link href="../css/print.css" rel="stylesheet" type="text/css" media="print" /> <!-- siehe screen.css -->
<link href="../css/screen.css" rel="stylesheet" type="text/css" media="screen, projection" />
<!--[if lte IE 6]><link rel="stylesheet" href="../css/ielte6.css" type="text/css" media="screen" /><![endif]-->
</head>
<body>
<?php
ini_set('max_execution_time', 0);
$benchmark = false;
$delete = false;
$dbh = new PDO("mysql:host=127.0.0.1;port=3306;dbname=test_images_pending;charset=utf-8", "root", "");
$dbh->setAttribute( PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION );
$dbh->setAttribute(PDO::ATTR_AUTOCOMMIT, FALSE);
$timers = array();
try {
$query = "SELECT * FROM test_images.pictures
INNER JOIN test_images_pending.pictures
USING ( digest )";
$sth = $dbh->prepare($query);
$sth->execute();
while ($join_rows = $sth->fetch(PDO::FETCH_NUM, PDO::FETCH_ORI_NEXT)) {
$digest = $join_rows[0];
$correct_pic_id = $join_rows[1];
$wrong_pic_id = $join_rows[2];
try {
$dbh->beginTransaction();
$query = "INSERT INTO test_images.stored_pictures
(url, pid, num, updated_at, created_at, picture_id)
SELECT
url, pid, num, updated_at, created_at, :correct_pic_id FROM test_images_pending.stored_pictures WHERE picture_id = :wrong_pic_id;";
$sth4 = $dbh->prepare($query);
$sth4->bindParam(':correct_pic_id', $correct_pic_id);
$sth4->bindParam(':wrong_pic_id', $wrong_pic_id);
$sth4->execute();
$dbh->commit();
} catch (PDOException $e) {
$dbh->rollback();
print "<pre>"; print_r($e); print "</pre>"; exit;
}
}
} catch (PDOException $e) {
print "<pre>"; print_r($e); print "</pre>"; exit;
}
try {
$query = "SELECT COUNT(id) FROM `signatures` WHERE (`id` - `picture_id` !=0) ";
$sth = $dbh->prepare($query);
$sth->execute();
$count = $sth->fetchColumn();
if($count > 0) {
die("we got a sig that aint matching its pic_id, we cant assume sig_id = pic_id. Back to drawing board");
}
$sth = null;
$query = " SELECT digest, id
FROM test_images_pending.pictures
WHERE digest NOT IN
(
SELECT digest
FROM test_images.pictures
)";
$sth = $dbh->prepare($query);
$sth->execute();
while ($not_in_rows = $sth->fetch(PDO::FETCH_NUM, PDO::FETCH_ORI_NEXT)) {
$digest = $not_in_rows[0];
$wrong_pic_id = $not_in_rows[1];
try {
$dbh->beginTransaction();
$query = "INSERT INTO test_images.pictures
(id, digest)
VALUES
(default, :digest);";
$sth2 = $dbh->prepare($query);
$sth2->bindParam(':digest', $digest);
$sth2->execute();
$new_pic_id = $dbh->lastInsertId();
$query = "INSERT INTO test_images.stored_pictures
(url, pid, num, updated_at, created_at, picture_id)
SELECT
url, pid, num, updated_at, created_at, :new_pic_id FROM test_images_pending.stored_pictures WHERE picture_id = :wrong_pic_id;";
$sth3 = $dbh->prepare($query);
$sth3->bindParam(':new_pic_id', $new_pic_id);
$sth3->bindParam(':wrong_pic_id', $wrong_pic_id);
$sth3->execute();
$query = "INSERT INTO test_images.signatures
(compressed_signature, picture_id)
SELECT
compressed_signature, :new_pic_id FROM test_images_pending.signatures WHERE picture_id = :wrong_pic_id;";
$sth4 = $dbh->prepare($query);
$sth4->bindParam(':new_pic_id', $new_pic_id);
$sth4->bindParam(':wrong_pic_id', $wrong_pic_id);
$sth4->execute();
$new_sig_id = $dbh->lastInsertId();
$query = "INSERT INTO test_images.words
(pos_and_word, signature_id)
SELECT
pos_and_word, :new_sig_id FROM test_images_pending.words WHERE signature_id = :old_sig_id
";
$sth9 = $dbh->prepare($query);
$sth9->bindParam(':old_sig_id', $wrong_pic_id);
$sth9->bindParam(':new_sig_id', $new_sig_id);
$sth9->execute();
$dbh->commit();
} catch (PDOException $e) {
$dbh->rollback();
print "<pre>"; print_r($e); print "</pre>"; exit;
}
}
} catch (PDOException $e) {
print "<pre>"; print_r($e); print "</pre>"; exit;
}
function buffer_flush(){
echo str_pad('', 512);
echo '<!-- -->';
if(ob_get_length()){
@ob_flush();
@flush();
@ob_end_flush();
}
@ob_start();
}
?>
</body>
</html>