This question is a more specific version of a previous question I asked.

CREATE TABLE Test4_ClusterMatches
(
`match_index` INT UNSIGNED,
`cluster_index` INT UNSIGNED,
`id` INT NOT NULL AUTO_INCREMENT,
`tfidf` FLOAT,
PRIMARY KEY (`cluster_index`,`match_index`,`id`)
);
mysql> explain SELECT `match_index`, SUM(`tfidf`) AS total
FROM Test4_ClusterMatches WHERE `cluster_index` IN (1,2,3 ... 3000)
GROUP BY `match_index`;
It uses a temporary table and a filesort, which slows it down:
+----+-------------+----------------------+-------+---------------+---------+---------+------+-------+-----------------------------------------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+----------------------+-------+---------------+---------+---------+------+-------+-----------------------------------------------------------+
| 1 | SIMPLE | Test4_ClusterMatches | range | PRIMARY | PRIMARY | 4 | NULL | 51540 | Using where; Using index; Using temporary; Using filesort |
+----+-------------+----------------------+-------+---------------+---------+---------+------+-------+-----------------------------------------------------------+
Using the current index, the query would need to sort on cluster_index first in order to eliminate Using temporary and Using filesort, but doing that gives the wrong results for sum(tfidf).
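For illustration (my sketch, not from the question): grouping in index order, e.g. on (cluster_index, match_index), would let MySQL read the rows pre-sorted and skip the sort, but it produces one partial sum per (cluster_index, match_index) pair rather than one total per match_index:

-- Groups in index order, so no sort is needed - but this is NOT the wanted
-- result: each (cluster_index, match_index) pair gets its own partial sum.
SELECT `cluster_index`, `match_index`, SUM(`tfidf`) AS total
FROM Test4_ClusterMatches
WHERE `cluster_index` IN (1,2,3)
GROUP BY `cluster_index`, `match_index`;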
Changing the primary key to
PRIMARY KEY (`match_index`,`cluster_index`,`id`)
does not use a filesort or a temporary table, but it scans 14,932,441 rows, so it is also slow:
+----+-------------+----------------------+-------+---------------+---------+---------+------+----------+--------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+----+-------------+----------------------+-------+---------------+---------+---------+------+----------+--------------------------+
| 1 | SIMPLE | Test5_ClusterMatches | index | NULL | PRIMARY | 16 | NULL | 14932441 | Using where; Using index |
+----+-------------+----------------------+-------+---------------+---------+---------+------+----------+--------------------------+
Using a tight index scan, by running the search over just one cluster_index,
mysql> explain SELECT `match_index`, SUM(`tfidf`) AS total
FROM Test4_ClusterMatches WHERE `cluster_index` = 3000
GROUP BY `match_index`;
eliminates the use of the temporary table and the filesort. I'm not sure whether this can be exploited by some magic sql-fu that I haven't come across yet?
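One conceivable piece of sql-fu (my sketch, not from the original post) would be to chain one tight index scan per cluster_index and re-aggregate the partial sums. The outer GROUP BY still needs a temporary table, and with 3,000 clusters the statement becomes unwieldy, so this is illustrative rather than a recommendation:

-- Hypothetical: one tight index scan per cluster_index, with the partial
-- sums re-aggregated in an outer query; shown for three clusters only.
SELECT match_index, SUM(total) AS total
FROM (
  SELECT `match_index`, SUM(`tfidf`) AS total
    FROM Test4_ClusterMatches WHERE `cluster_index` = 1 GROUP BY `match_index`
  UNION ALL
  SELECT `match_index`, SUM(`tfidf`) AS total
    FROM Test4_ClusterMatches WHERE `cluster_index` = 2 GROUP BY `match_index`
  UNION ALL
  SELECT `match_index`, SUM(`tfidf`) AS total
    FROM Test4_ClusterMatches WHERE `cluster_index` = 3 GROUP BY `match_index`
) AS per_cluster
GROUP BY match_index;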
How can I change the query so that it covers all 3,000 cluster_indexes while avoiding Using temporary and Using filesort, and without scanning 14,932,441 rows?
Update: using the table above, the query then gives 10 rows in (0.41 sec) :), but it still uses a temporary table and a filesort over cluster_index, match_index and tfidf. I was wondering whether there is any way to get it faster by eliminating Using temporary and Using filesort?
Answer 0 (score: 2)
I had a quick play around with this and here is what I came up with - hope it helps...
drop table if exists cluster_matches;
create table cluster_matches
(
cluster_id int unsigned not null,
match_id int unsigned not null,
...
tfidf float not null default 0,
primary key (cluster_id, match_id) -- if this isn't unique, add id to the end !!
)
engine=innodb;
select count(*) from cluster_matches;
count(*)
========
17974591
select count(distinct(cluster_id)) from cluster_matches;
count(distinct(cluster_id))
===========================
1000000
select count(distinct(match_id)) from cluster_matches;
count(distinct(match_id))
=========================
6000
explain select
cm.match_id,
sum(tfidf) as sum_tfidf,
count(*) as count_tfidf
from
cluster_matches cm
where
cm.cluster_id between 5000 and 10000
group by
cm.match_id
order by
sum_tfidf desc limit 10;
id select_type table type possible_keys key key_len ref rows Extra
== =========== ===== ==== ============= === ======= === ==== =====
1 SIMPLE cm range PRIMARY PRIMARY 4 290016 Using where; Using temporary; Using filesort
runtime - 0.067 seconds.
A respectable runtime of 0.067 seconds, but I think we can do better.
You'll have to forgive me for not wanting to type/pass in a list of 5,000+ random cluster_ids!
call sum_cluster_matches(null,1); -- for testing
call sum_cluster_matches('1,2,3,4,....5000',1);
Most of the sproc isn't very elegant - all it does is split a csv string into individual cluster_ids and populate a temp table.
drop procedure if exists sum_cluster_matches;
delimiter #
create procedure sum_cluster_matches
(
in p_cluster_id_csv varchar(65535),
in p_show_explain tinyint unsigned
)
proc_main:begin
declare v_id varchar(10);
declare v_done tinyint unsigned default 0;
declare v_idx int unsigned default 1;
create temporary table tmp(cluster_id int unsigned not null primary key);
-- not every elegant - split the string into tokens and put into a temp table...
if p_cluster_id_csv is not null then
while not v_done do
set v_id = trim(substring(p_cluster_id_csv, v_idx,
if(locate(',', p_cluster_id_csv, v_idx) > 0,
locate(',', p_cluster_id_csv, v_idx) - v_idx, length(p_cluster_id_csv))));
if length(v_id) > 0 then
set v_idx = v_idx + length(v_id) + 1;
insert ignore into tmp values(v_id);
else
set v_done = 1;
end if;
end while;
else
-- instead of passing in a huge comma separated list of cluster_ids im cheating here to save typing
insert into tmp select cluster_id from clusters where cluster_id between 5000 and 10000;
-- end cheat
end if;
if p_show_explain then
select count(*) as count_of_tmp from tmp;
explain
select
cm.match_id,
sum(tfidf) as sum_tfidf,
count(*) as count_tfidf
from
cluster_matches cm
inner join tmp on tmp.cluster_id = cm.cluster_id
group by
cm.match_id
order by
sum_tfidf desc limit 10;
end if;
select
cm.match_id,
sum(tfidf) as sum_tfidf,
count(*) as count_tfidf
from
cluster_matches cm
inner join tmp on tmp.cluster_id = cm.cluster_id
group by
cm.match_id
order by
sum_tfidf desc limit 10;
drop temporary table if exists tmp;
end proc_main #
delimiter ;
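For reference, the SUBSTRING/LOCATE loop in the sproc walks the csv string one token at a time; the standalone probe below (illustrative only, with an assumed input of '12,34,5') shows what the first iteration extracts:

-- What the first loop iteration computes for '12,34,5' with v_idx = 1:
-- locate finds the comma at position 3, so the token length is 3 - 1 = 2.
select substring('12,34,5', 1, locate(',', '12,34,5', 1) - 1) as first_token; -- '12'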
call sum_cluster_matches(null,1);
count_of_tmp
============
5001
id select_type table type possible_keys key key_len ref rows Extra
== =========== ===== ==== ============= === ======= === ==== =====
1 SIMPLE tmp index PRIMARY PRIMARY 4 5001 Using index; Using temporary; Using filesort
1 SIMPLE cm ref PRIMARY PRIMARY 4 vldb_db.tmp.cluster_id 8
match_id sum_tfidf count_tfidf
======== ========= ===========
1618 387 64
1473 387 64
3307 382 64
2495 373 64
1135 373 64
3832 372 57
3203 362 58
5464 358 67
2100 355 60
1634 354 52
runtime 0.028 seconds.
A big improvement in both the explain plan and the runtime.
Answer 1 (score: 0)
If the cluster_index values in the WHERE condition are contiguous, then instead of IN use:

WHERE (cluster_index >= 1) and (cluster_index <= 3000)
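Dropped into the original query (table and column names taken from the question), that reads:

-- Range predicate instead of the long IN list:
SELECT `match_index`, SUM(`tfidf`) AS total
FROM Test4_ClusterMatches
WHERE `cluster_index` >= 1 AND `cluster_index` <= 3000
GROUP BY `match_index`;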
If the values are not contiguous, you can create a temporary table to hold the cluster_index values, put an index on it, and use an INNER JOIN to the temporary table.
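A minimal sketch of that suggestion (the temporary-table name is assumed; the stored procedure in the answer above does essentially the same thing):

-- Hypothetical sketch: stage the wanted cluster_index values, then join.
CREATE TEMPORARY TABLE wanted_clusters (
  cluster_index INT UNSIGNED NOT NULL PRIMARY KEY
);
INSERT INTO wanted_clusters VALUES (1), (17), (3000); -- the non-contiguous ids

SELECT cm.match_index, SUM(cm.tfidf) AS total
FROM Test4_ClusterMatches cm
INNER JOIN wanted_clusters w ON w.cluster_index = cm.cluster_index
GROUP BY cm.match_index;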