我需要一些关于 PostgreSQL 9.6 查询性能方面的帮助。下面是我的表结构的一个极简示例:
-- Parent table of the simplified example.
CREATE TABLE invoice (
    id         bigserial    PRIMARY KEY,  -- auto-generated 64-bit key
    some_field varchar(200)               -- nullable free-form column used as a grouping key below
);
-- Child table of the simplified example; invoice_id refers to invoice.id.
CREATE TABLE invoice_item (
    id             serial        PRIMARY KEY,
    invoice_id     bigint,                   -- nullable: an item may have no invoice
    article_number varchar(50),
    quantity       numeric(19,2) NOT NULL,
    CONSTRAINT invoice_item_fk
        FOREIGN KEY (invoice_id)
        REFERENCES invoice (id) MATCH SIMPLE
        ON UPDATE NO ACTION
        ON DELETE NO ACTION
);
-- Secondary indexes on the two columns the example queries group by.
CREATE INDEX invoice_some_field_idx
    ON invoice (some_field);

CREATE INDEX invoice_item_article_number_idx
    ON invoice_item (article_number);
invoice 表中大约有 500 000 行,invoice_item 表中大约有 150 万行。
运行以下查询非常快:
-- Total quantity per article number, counting only items that join to an invoice.
SELECT
    ii.article_number,
    SUM(ii.quantity)
FROM invoice AS i
INNER JOIN invoice_item AS ii
    ON ii.invoice_id = i.id
GROUP BY
    ii.article_number;
在 article_number 上建立索引后,查询耗时从约 7 秒降到约 55 毫秒。
现在的问题出在按父表的列进行 GROUP BY 时:
-- Total item quantity per value of the parent column invoice.some_field.
SELECT
    i.some_field,
    SUM(ii.quantity)
FROM invoice AS i
INNER JOIN invoice_item AS ii
    ON ii.invoice_id = i.id
GROUP BY
    i.some_field;
无论 some_field 上是否有索引,此查询的耗时都一样(约 5 秒)。
我觉得我在这里漏掉了某些非常明显的东西。
---编辑----
我对查询规划还很陌生;而且在对上面的示例表做了更多测试之后,我发现得到的结果与实际代码中的情况差别很大。
以下是实际的表定义:
-- Actual receipt table from the question.
-- varchar(n) and timestamp are shorthand for character varying(n) and
-- timestamp without time zone.
CREATE TABLE receipt2 (
    id serial NOT NULL,
    version bigint NOT NULL,
    store_number integer NOT NULL,
    address1 varchar(200),
    date_created timestamp NOT NULL,
    round_off numeric(19,2) NOT NULL,
    date_created_by_cash_register timestamp NOT NULL,
    address2 varchar(200),
    receipt_number integer NOT NULL,
    application_version varchar(50),
    control_box_serial_number_original varchar(200),
    last_updated timestamp NOT NULL,
    cash_register_user_id uuid NOT NULL,
    control_code_copy varchar(200),
    cash_register_number integer NOT NULL,
    control_code_original varchar(200),
    zip_code varchar(50),
    receipt_footer varchar(20000),
    phone_number varchar(50),
    control_box_serial_number_copy varchar(200),
    corporate_identity varchar(50) NOT NULL,
    city varchar(200),
    money_back numeric(19,2) NOT NULL,
    number_of_copies_printed integer NOT NULL,
    cash_register_user_username varchar(50) NOT NULL,
    company_name varchar(200) NOT NULL,
    email varchar(200),
    website varchar(200),

    CONSTRAINT receipt2_pkey
        PRIMARY KEY (id),
    -- A receipt number may occur only once per company, store and cash register.
    CONSTRAINT uk9f6f61365739562846c491f21efb
        UNIQUE (corporate_identity, store_number, cash_register_number, receipt_number)
)
WITH (OIDS = FALSE);
-- Secondary indexes on receipt2. No access method is named because btree is
-- the default for CREATE INDEX.
CREATE INDEX receipt2_cash_register_user_id_idx
    ON receipt2 (cash_register_user_id);

CREATE INDEX receipt2_date_created_by_cash_register_idx
    ON receipt2 (date_created_by_cash_register);

CREATE INDEX receipt2_store_number_idx
    ON receipt2 (store_number);

CREATE INDEX receipt2corpidx
    ON receipt2 (corporate_identity COLLATE pg_catalog."default");

-- Composite index; its leading column is store_number, the same column that
-- receipt2_store_number_idx indexes on its own.
CREATE INDEX receipt2corpstoreidx
    ON receipt2 (store_number, corporate_identity COLLATE pg_catalog."default");
-- Actual receipt line-item table from the question; every row references a receipt2 row.
-- varchar(n) is shorthand for character varying(n).
CREATE TABLE receipt_item2 (
    id serial NOT NULL,
    version bigint NOT NULL,
    cost_excluding_vat numeric(19,2) NOT NULL,
    account_number integer,
    receipt_item_type varchar(255) NOT NULL,
    article_group_id uuid,
    supplier_number integer,
    purchase_price_excluding_vat numeric(19,2) NOT NULL,
    -- NOTE(review): declared bigint, while the referenced receipt2.id is serial (integer).
    receipt_id bigint NOT NULL,
    text varchar(20000),
    promotion_id uuid,
    price_including_vat numeric(19,2) NOT NULL,
    discount_type varchar(255) NOT NULL,
    profit_excluding_vat numeric(19,2) NOT NULL,
    price_excluding_vat numeric(19,2) NOT NULL,
    discount_amount_including_vat numeric(19,2) NOT NULL,
    article_type varchar(255),
    article_number varchar(50),
    cost_including_vat numeric(19,2) NOT NULL,
    purchase_cost_excluding_vat numeric(19,2) NOT NULL,
    hidden boolean NOT NULL,
    row_index integer NOT NULL,
    quantity numeric(19,2) NOT NULL,
    discount numeric(19,2) NOT NULL,
    discount_amount_excluding_vat numeric(19,2) NOT NULL,
    description varchar(200),
    vat numeric(19,2) NOT NULL,

    CONSTRAINT receipt_item2_pkey
        PRIMARY KEY (id),
    CONSTRAINT fksohgmt8ntavcgj10ha2duc8lb
        FOREIGN KEY (receipt_id)
        REFERENCES receipt2 (id) MATCH SIMPLE
        ON UPDATE NO ACTION
        ON DELETE NO ACTION
)
WITH (OIDS = FALSE);
-- Index on the grouping column of query 1 (btree is the default access method).
CREATE INDEX receipt_item2_article_number_idx
    ON receipt_item2 (article_number COLLATE pg_catalog."default");
查询 1 及其 EXPLAIN 输出。这个查询非常快,大约 55 毫秒。
-- Query 1: totals per article number for line items of type 'ARTICLE' on
-- receipts whose cash-register timestamp falls inside the BETWEEN range.
-- NOTE: the string bounds are compared as timestamps, so the upper bound is
-- 2017-12-31 00:00:00; receipts later on 31 December are not included.
-- NOTE: there is no ORDER BY, so which 100 groups LIMIT returns is not guaranteed.
SELECT
    receipt_item.article_number,
    SUM(receipt_item.quantity) AS "quantity",
    SUM(receipt_item.cost_excluding_vat) AS "costExcludingVat",
    SUM(receipt_item.cost_including_vat) AS "costIncludingVat",
    SUM(receipt_item.purchase_cost_excluding_vat) AS "purchaseCostExcludingVat",
    SUM(receipt_item.profit_excluding_vat) AS "profitExcludingVat"
FROM receipt2 AS receipt
INNER JOIN receipt_item2 AS receipt_item
    ON receipt_item.receipt_id = receipt.id
WHERE receipt.date_created_by_cash_register BETWEEN '2017-01-01' AND '2017-12-31'
    AND receipt_item.receipt_item_type = 'ARTICLE'
GROUP BY
    receipt_item.article_number
LIMIT 100;
"Limit (cost=0.85..4821.60 rows=100 width=167)"
" -> GroupAggregate (cost=0.85..948001.24 rows=19665 width=167)"
" Group Key: receipt_item.article_number"
" -> Nested Loop (cost=0.85..925058.77 rows=1500000 width=35)"
" -> Index Scan using receipt_item2_article_number_idx on receipt_item2 receipt_item (cost=0.43..196242.77 rows=1500000 width=43)"
" Filter: ((receipt_item_type)::text = 'ARTICLE'::text)"
" -> Index Scan using receipt2_pkey on receipt2 receipt (cost=0.42..0.48 rows=1 width=4)"
" Index Cond: (id = receipt_item.receipt_id)"
" Filter: ((date_created_by_cash_register >= '2017-01-01 00:00:00'::timestamp without time zone) AND (date_created_by_cash_register <= '2017-12-31 00:00:00'::timestamp without time zone))"
查询 2 及其 EXPLAIN 输出。无论 cash_register_user_id 上是否存在索引,此查询都要花费 2.3 秒。
-- Query 2: the same totals as query 1, but grouped by the cash-register user
-- (a column of the parent table receipt2) and restricted to store number 1.
-- NOTE: the string bounds are compared as timestamps, so the upper bound is
-- 2017-12-31 00:00:00; receipts later on 31 December are not included.
-- NOTE: there is no ORDER BY, so which groups LIMIT returns is not guaranteed.
SELECT
    receipt.cash_register_user_id AS "userId",
    SUM(receipt_item.quantity) AS "quantity",
    SUM(receipt_item.cost_excluding_vat) AS "costExcludingVat",
    SUM(receipt_item.cost_including_vat) AS "costIncludingVat",
    SUM(receipt_item.purchase_cost_excluding_vat) AS "purchaseCostExcludingVat",
    SUM(receipt_item.profit_excluding_vat) AS "profitExcludingVat"
FROM receipt2 AS receipt
INNER JOIN receipt_item2 AS receipt_item
    ON receipt_item.receipt_id = receipt.id
WHERE receipt.date_created_by_cash_register BETWEEN '2017-01-01' AND '2017-12-31'
    AND receipt_item.receipt_item_type = 'ARTICLE'
    AND receipt.store_number = 1
GROUP BY
    receipt.cash_register_user_id
LIMIT 100;
"Limit (cost=154761.00..154761.45 rows=20 width=176)"
" -> HashAggregate (cost=154761.00..154761.45 rows=20 width=176)"
" Group Key: receipt.cash_register_user_id"
" -> Hash Join (cost=28135.00..132261.00 rows=1500000 width=44)"
" Hash Cond: (receipt_item.receipt_id = receipt.id)"
" -> Seq Scan on receipt_item2 receipt_item (cost=0.00..57133.00 rows=1500000 width=36)"
" Filter: ((receipt_item_type)::text = 'ARTICLE'::text)"
" -> Hash (cost=18955.00..18955.00 rows=500000 width=20)"
" -> Seq Scan on receipt2 receipt (cost=0.00..18955.00 rows=500000 width=20)"
" Filter: ((date_created_by_cash_register >= '2017-01-01 00:00:00'::timestamp without time zone) AND (date_created_by_cash_register <= '2017-12-31 00:00:00'::timestamp without time zone) AND (store_number = 1))"
这有点偏离本问题的主题,但无论如何,下一个问题将是如何对结果排序。最理想的情况(“圣杯”)是能够按汇总后的数量、成本等进行排序……
答案 0 :(得分:0)
真正的答案要看执行计划。不过下面的说明或许能让你大致了解发生了什么。第一个查询基本上可以重写为:
-- The answer's rewrite of the first query: keep only items whose invoice
-- exists, then total the quantity per article number.
SELECT
    ii.article_number,
    SUM(ii.quantity)
FROM invoice_item AS ii
WHERE EXISTS (
    SELECT 1
    FROM invoice AS i
    WHERE i.id = ii.invoice_id
)
GROUP BY
    ii.article_number;
而这个查询可以通过扫描 invoice_item(article_number) 上的索引来完成:
每个分组的信息可以在扫描过程中直接汇总,不需要散列或排序,只需对每一行做一次查找。
第二个查询则没有类似的办法来省去聚合所需的额外工作。