我在PostgreSQL 9.5中有以下查询,运行缓慢。测试数据规模:fact表有22000行,store表295行,product表10行。
-- Build the base aggregation table: one row per (ticket, product, promo flag)
-- with unit and sales totals, restricted to the top 5000 products ranked by
-- distinct-ticket count.
CREATE TABLE calc_base AS
SELECT
    f.ticket_id,
    p.product_id,
    p.product_desc,
    f.promotion_flag AS promo_flag,
    SUM(f.quantity)  AS num_units,
    SUM(f.sales)     AS num_sales
FROM fact AS f
INNER JOIN product AS p
    ON f.product_id = p.product_id
-- Semi-join: keep only rows whose store exists in the store dimension.
-- EXISTS stops at the first match and avoids materializing the store list.
WHERE EXISTS (SELECT 1 FROM store AS s WHERE s.store_id = f.store_id)
  AND p.product_id IN (
        -- Top 5000 products by number of distinct tickets.
        SELECT product_id
        FROM fact
        GROUP BY product_id
        ORDER BY COUNT(DISTINCT ticket_id) DESC
        LIMIT 5000
      )
-- Explicit column names instead of positional GROUP BY 1,2,3,4.
GROUP BY f.ticket_id, p.product_id, p.product_desc, f.promotion_flag;
以下是此查询的索引列表:
-- Primary keys on the dimension tables.
-- NOTE(review): TIME is a reserved word in standard SQL — consider renaming
-- the table (e.g. date_dim) to avoid quoting issues on other engines.
ALTER TABLE product  ADD PRIMARY KEY (product_id);
ALTER TABLE store    ADD PRIMARY KEY (store_id);
ALTER TABLE customer ADD PRIMARY KEY (customer_id);
ALTER TABLE time     ADD PRIMARY KEY (date_id);
-- Foreign keys from the fact table to each dimension, named explicitly so
-- constraint errors and migrations are greppable.
ALTER TABLE fact ADD CONSTRAINT fact_product_id_fk FOREIGN KEY (product_id) REFERENCES product (product_id);
ALTER TABLE fact ADD CONSTRAINT fact_store_id_fk   FOREIGN KEY (store_id)   REFERENCES store (store_id);
ALTER TABLE fact ADD CONSTRAINT fact_date_id_fk    FOREIGN KEY (date_id)    REFERENCES time (date_id);
-- NOTE(review): segment_id references customer.customer_id — confirm this
-- column-to-column mapping is intentional.
ALTER TABLE fact ADD CONSTRAINT fact_segment_id_fk FOREIGN KEY (segment_id) REFERENCES customer (customer_id);
-- Supporting indexes for the foreign-key columns (Postgres does not create
-- them automatically), named per the <table>_<col>_idx convention.
CREATE INDEX fact_product_id_idx ON fact (product_id);
CREATE INDEX fact_store_id_idx   ON fact (store_id);
CREATE INDEX fact_date_id_idx    ON fact (date_id);
如何优化此查询以加快速度?我需要专业建议来优化大数据集上的聚合与分组,因为我的真实数据集有30亿行事务,我必须在这30亿行上运行此查询。
随后我使用更大的数据集进行测试:fact表200万行,product表344117行,并去掉了 where store_id in (select store_id from store) 条件,运行以下查询。
-- Same rollup on the larger dataset: per (ticket, product, promo flag) KPI
-- totals, limited to the top 5000 products by distinct-ticket count.
-- (The store filter from the first version is intentionally omitted here.)
CREATE TABLE calc_base AS
SELECT
    f.ticket_id,
    p.product_id,
    p.product_desc,
    f.promotion_flag    AS promo_flag,
    SUM(f.quantity_abs) AS kpi_units,
    SUM(f.sales_abs)    AS kpi_sales
FROM fact AS f
INNER JOIN product AS p
    ON f.product_id = p.product_id
WHERE p.product_id IN (
        -- Top 5000 products by number of distinct tickets.
        SELECT product_id
        FROM fact
        GROUP BY product_id
        ORDER BY COUNT(DISTINCT ticket_id) DESC
        LIMIT 5000
      )
-- Explicit column names instead of positional GROUP BY 1,2,3,4.
GROUP BY f.ticket_id, p.product_id, p.product_desc, f.promotion_flag;
查询计划:
"GroupAggregate (cost=472629.13..505129.13 rows=1000000 width=39) (actual time=8230.069..11230.872 rows=2000000 loops=1)"
" Group Key: a.ticket_id, b.product_id, b.product_desc, a.promotion_flag"
" Buffers: shared hit=49090, temp read=19225 written=19225"
" -> Sort (cost=472629.13..475129.13 rows=1000000 width=39) (actual time=8230.053..9771.612 rows=2000000 loops=1)"
" Sort Key: a.ticket_id, b.product_id, b.product_desc, a.promotion_flag"
" Sort Method: external merge Disk: 98968kB"
" Buffers: shared hit=49090, temp read=19225 written=19225"
" -> Hash Join (cost=300396.32..357345.29 rows=1000000 width=39) (actual time=5424.131..6357.988 rows=2000000 loops=1)"
" Hash Cond: (a.product_id = b.product_id)"
" Buffers: shared hit=49076, temp read=6847 written=6847"
" -> Seq Scan on fact a (cost=0.00..39449.00 rows=2000000 width=25) (actual time=0.009..129.726 rows=2000000 loops=1)"
" Buffers: shared hit=19449"
" -> Hash (cost=298245.59..298245.59 rows=172058 width=26) (actual time=5423.385..5423.385 rows=914 loops=1)"
" Buckets: 262144 Batches: 1 Memory Usage: 2101kB"
" Buffers: shared hit=29627, temp read=6847 written=6847"
" -> Hash Join (cost=283334.26..298245.59 rows=172058 width=26) (actual time=5334.482..5423.097 rows=914 loops=1)"
" Hash Cond: (b.product_id = fact.product_id)"
" Buffers: shared hit=29627, temp read=6847 written=6847"
" -> Seq Scan on product b (cost=0.00..13616.17 rows=344117 width=20) (actual time=0.007..40.339 rows=344117 loops=1)"
" Buffers: shared hit=10175"
" -> Hash (cost=283328.34..283328.34 rows=473 width=6) (actual time=5334.458..5334.458 rows=914 loops=1)"
" Buckets: 1024 Batches: 1 Memory Usage: 51kB"
" Buffers: shared hit=19452, temp read=6847 written=6847"
" -> Limit (cost=283322.43..283323.61 rows=473 width=14) (actual time=5334.213..5334.288 rows=914 loops=1)"
" Buffers: shared hit=19452, temp read=6847 written=6847"
" -> Sort (cost=283322.43..283323.61 rows=473 width=14) (actual time=5334.210..5334.235 rows=914 loops=1)"
" Sort Key: (count(DISTINCT fact.ticket_id)) DESC"
" Sort Method: quicksort Memory: 67kB"
" Buffers: shared hit=19452, temp read=6847 written=6847"
" -> GroupAggregate (cost=268296.69..283301.42 rows=473 width=14) (actual time=2768.464..5333.807 rows=914 loops=1)"
" Group Key: fact.product_id"
" Buffers: shared hit=19449, temp read=6847 written=6847"
" -> Sort (cost=268296.69..273296.69 rows=2000000 width=14) (actual time=2756.509..3782.823 rows=2000000 loops=1)"
" Sort Key: fact.product_id"
" Sort Method: external merge Disk: 48080kB"
" Buffers: shared hit=19449, temp read=6014 written=6014"
" -> Seq Scan on fact (cost=0.00..39449.00 rows=2000000 width=14) (actual time=0.003..321.660 rows=2000000 loops=1)"
" Buffers: shared hit=19449"
"Planning time: 5.449 ms"
"Execution time: 12673.017 ms"
我可以根据查询计划进一步优化吗?