大家好,我在Squid代理中使用日志守护进程(https://github.com/paranormal/blooper)将访问日志放入PostreSQL并创建了一个触发函数:
DECLARE
newtime varchar := EXTRACT (MONTH FROM NEW."time")::varchar;
newyear varchar := EXTRACT (YEAR FROM NEW."time")::varchar;
user_name varchar := REPLACE (NEW.user_name, '.', '_');
partname varchar := newtime || '_' || newyear;
tablename varchar := user_name || '.accesses_' || partname;
BEGIN
IF NEW.user_name IS NOT NULL THEN
EXECUTE 'CREATE SCHEMA IF NOT EXISTS ' || user_name;
EXECUTE 'CREATE TABLE IF NOT EXISTS '
|| tablename
|| '('
|| 'CHECK (user_name = ''' || NEW.user_name || ''' AND EXTRACT(MONTH FROM "time") = ' || newtime || ' AND EXTRACT (YEAR FROM "time") = ' || newyear || ')'
|| ') INHERITS (public.accesses)';
EXECUTE 'CREATE INDEX IF NOT EXISTS access_index_' || partname || '_user_name ON ' || tablename || ' (user_name)';
EXECUTE 'CREATE INDEX IF NOT EXISTS access_index_' || partname || '_time ON ' || tablename || ' ("time")';
EXECUTE 'INSERT INTO ' || tablename || ' SELECT $1.*' USING NEW;
END IF;
RETURN NULL;
END;
它的主要功能是通过user_name和访问的月份来进行表分区,从主干净表继承:
CREATE TABLE public.accesses
(
id integer NOT NULL DEFAULT nextval('accesses_id_seq'::regclass),
"time" timestamp with time zone NOT NULL,
time_response integer,
mac_source macaddr,
ip_source inet NOT NULL,
ip_destination inet,
user_name character varying(40),
http_status_code numeric(3,0) NOT NULL,
http_reply_size bigint NOT NULL,
http_request_method character varying(15) NOT NULL,
http_request_url character varying(4166) NOT NULL,
http_content_type character varying(100),
squid_hier_code character varying(20),
squid_request_status character varying(50),
user_id integer,
CONSTRAINT accesses_http_request_method_fkey FOREIGN KEY (http_request_method)
REFERENCES public.http_requests (method) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION,
CONSTRAINT accesses_http_status_code_fkey FOREIGN KEY (http_status_code)
REFERENCES public.http_statuses (code) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION,
CONSTRAINT accesses_user_id_fkey FOREIGN KEY (user_id)
REFERENCES public.users (id) MATCH SIMPLE
ON UPDATE NO ACTION ON DELETE NO ACTION
)
主要问题是按用户名和时间获取http_reply_size分组的总和,我的查询是:
SELECT
"time",
user_name,
sum(http_reply_size)
FROM
accesses
WHERE
extract(epoch from "time") BETWEEN 1516975122 AND 1516996722
GROUP BY
"time",
user_name
但是这个查询在服务器中非常慢(仅在2天内3' 237&997行)。因此,PostgreSQL可以根据需要优化查询,或者我需要使用另一个SQL或NoSQL系统。
答案 0 :(得分:0)
尝试在每个分区上包含CHECK
条件,因此不必扫描所有表格。
在我的情况下是这样的:
CREATE TABLE IF NOT EXISTS ' || table_name || '(
CONSTRAINT ' || pk || ' PRIMARY KEY (avl_id),
CHECK ( event_time >= ''' || begin_time || ''' AND event_time < ''' || end_time || ''' )
) INHERITS (avl_db.avl);
也不要使用需要计算每行值的extract(epoch from "time")
而不能使用您为"time"
创建的索引
所以请使用这样来获得索引的优势。
WHERE "time" >= '2018-01-01'::timestamp with time zone
and "time" < '2018-02-01'::timestamp with time zone