存在一个按时间戳记录信息的表:
-- Log table: one row per logged request, holding the timestamp and the
-- request id (the production table has further columns).
create table log_data (
    dttm       date,
    request_id number
);

-- Test data: 10,000 rows for each request_id in 1..3, every row stamped with
-- a random point in time within the last three days.
insert into log_data (dttm, request_id)
select sysdate - dbms_random.value(0, 3) as dttm,
       1 as request_id
from dual
connect by level <= 10000
union all
select sysdate - dbms_random.value(0, 3) as dttm,
       2 as request_id
from dual
connect by level <= 10000
union all
select sysdate - dbms_random.value(0, 3) as dttm,
       3 as request_id
from dual
connect by level <= 10000;

-- Composite index: timestamp first, request id second.
create index log_data_idx on log_data (dttm, request_id);
在某个时间间隔(比如一小时)内,按 request_id 统计请求数量的最佳方法是什么?
第一次(不理想的)尝试:
explain plan for
-- For every log row d, count the rows with the same request_id whose
-- timestamp lies in the one-hour window starting at d.dttm (both bounds
-- inclusive, 1 / 24 of a day = one hour).
-- NOTE: the result is grouped by (request_id, dttm), so rows of d that share
-- the same request_id and timestamp collapse into one group and every match
-- is counted once per such duplicate.
with log_data_per_hour as (
    select d.request_id,
           d.dttm as dttm_from,
           d.dttm + 1 / 24 as dttm_to,
           count(*) as total
    from log_data d
    inner join log_data dd
        on dd.request_id = d.request_id
       and dd.dttm between d.dttm and d.dttm + 1 / 24
    group by d.request_id, d.dttm, d.dttm + 1 / 24
)
-- Busiest window per request.
-- dense_rank LAST keeps the rows with the highest total, so dttm is the start
-- of the window that produced max(total). dense_rank FIRST would instead
-- return the start of the window with the lowest total.
select request_id,
       max(dttm_from) keep (dense_rank last order by total) dttm,
       max(total)
from log_data_per_hour
group by request_id;

-- Show the plan that was just stored in plan_table.
select plan_table_output
from table(dbms_xplan.display('plan_table', null, null));
/*
Plan hash value: 2118911212
----------------------------------------------------------------------------------
| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time |
----------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 3 | 105 | 1410 (98)| 00:00:24 |
| 1 | SORT GROUP BY NOSORT | | 3 | 105 | 1410 (98)| 00:00:24 |
| 2 | VIEW | | 832K| 27M| 1410 (98)| 00:00:24 |
| 3 | SORT GROUP BY | | 832K| 34M| 1410 (98)| 00:00:24 |
|* 4 | HASH JOIN | | 832K| 34M| 1379 (98)| 00:00:24 |
| 5 | TABLE ACCESS FULL| LOG_DATA | 31612 | 679K| 18 (0)| 00:00:01 |
| 6 | TABLE ACCESS FULL| LOG_DATA | 31612 | 679K| 18 (0)| 00:00:01 |
----------------------------------------------------------------------------------
Predicate Information (identified by operation id):
---------------------------------------------------
4 - access("DD"."REQUEST_ID"="D"."REQUEST_ID")
filter("DD"."DTTM">="D"."DTTM" AND
"DD"."DTTM"<=INTERNAL_FUNCTION("D"."DTTM")+.041666666666666666666666666666
6666666667)
Note
-----
- dynamic sampling used for this statement (level=4)
*/
第二次尝试:
explain plan for
-- For every log row d, a correlated scalar subquery counts the rows with the
-- same request_id whose timestamp lies in the one-hour window starting at
-- d.dttm (both bounds inclusive, 1 / 24 of a day = one hour).
with log_data_per_hour as (
    select d.request_id,
           d.dttm as dttm_from,
           d.dttm + 1 / 24 as dttm_to,
           (select count(*)
            from log_data dd
            where dd.dttm between d.dttm and d.dttm + 1 / 24
              and dd.request_id = d.request_id) as total
    from log_data d
)
-- Busiest window per request.
-- dense_rank LAST keeps the rows with the highest total, so dttm is the start
-- of the window that produced max(total). dense_rank FIRST would instead
-- return the start of the window with the lowest total.
select request_id,
       max(dttm_from) keep (dense_rank last order by total) dttm,
       max(total)
from log_data_per_hour
group by request_id;

-- Show the plan that was just stored in plan_table.
select plan_table_output
from table(dbms_xplan.display('plan_table', null, null));
/*
Plan hash value: 1803512279
-----------------------------------------------------------------------------------
| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time |
-----------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 31612 | 679K| 20 (10)| 00:00:01 |
| 1 | SORT AGGREGATE | | 1 | 22 | | |
|* 2 | FILTER | | | | | |
|* 3 | INDEX RANGE SCAN| LOG_DATA_IDX | 1 | 22 | 1 (0)| 00:00:01 |
| 4 | SORT GROUP BY | | 31612 | 679K| 20 (10)| 00:00:01 |
| 5 | TABLE ACCESS FULL| LOG_DATA | 31612 | 679K| 18 (0)| 00:00:01 |
-----------------------------------------------------------------------------------
Predicate Information (identified by operation id):
---------------------------------------------------
2 - filter(:B1<=:B2+.0416666666666666666666666666666666666667)
3 - access("DD"."DTTM">=:B1 AND "DD"."REQUEST_ID"=:B2 AND
"DD"."DTTM"<=:B3+.0416666666666666666666666666666666666667)
filter("DD"."REQUEST_ID"=:B1)
Note
-----
- dynamic sampling used for this statement (level=4)
*/
我想应该有比子查询更好的解决方案,因为这样的开销仍然很大。
非常感谢任何改进,提前谢谢!
答案 0 :(得分:1)
使用带 range(范围)子句的窗口函数,可以高效地统计一个时间间隔内的行数:
select request_id,
dttm as dttm_from,
count(*) over (partition by request_id order by dttm range between current row and interval '1' hour following) as total
from log_data
上面的查询会针对每个 request_id,统计从“当前”行开始、到其后 1 小时间隔内的所有行数。优点是只需要对表进行一次扫描。
这可以用来取代CTE:
-- For every log row, count the rows of the same request_id whose timestamp
-- lies between the current row's timestamp and one hour after it, using a
-- range window instead of a self-join or subquery.
with log_data_per_hour as (
    select request_id,
           dttm as dttm_from,
           count(*) over (
               partition by request_id
               order by dttm
               range between current row and interval '1' hour following
           ) as total
    from log_data
)
-- Busiest window per request.
-- dense_rank LAST keeps the rows with the highest total, so dttm is the start
-- of the window that produced max(total). dense_rank FIRST would instead
-- return the start of the window with the lowest total.
select request_id,
       max(dttm_from) keep (dense_rank last order by total) dttm,
       max(total)
from log_data_per_hour
group by request_id;
这将在我的计算机上返回以下执行计划:
PLAN_TABLE_OUTPUT
---------------------------------------------------------------------------------
Plan hash value: 3005213820
---------------------------------------------------------------------------------
| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time |
---------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 29920 | 1022K| 23 (5)| 00:00:01 |
| 1 | SORT GROUP BY NOSORT| | 29920 | 1022K| 23 (5)| 00:00:01 |
| 2 | VIEW | | 29920 | 1022K| 23 (5)| 00:00:01 |
| 3 | WINDOW SORT | | 29920 | 642K| 23 (5)| 00:00:01 |
| 4 | TABLE ACCESS FULL| LOG_DATA | 29920 | 642K| 22 (0)| 00:00:01 |
---------------------------------------------------------------------------------
Note
-----
- dynamic statistics used: dynamic sampling (level=2)
在我的电脑上运行大约50ms,而你的第二个语句大约需要600ms(平均5次运行)。
如果我交换索引中列的顺序,你的第二个语句的运行时间也会更好(大约200ms)。这是合理的,因为在标量子查询中利用索引的第一列来查找匹配行效率更高。