Entity-relationship diagram: http://img11.hostingpics.net/pics/32979039DB.png
创建:
-- Archive catalogue. parent_id references another row of this same table,
-- so the entries form a hierarchy.
CREATE TABLE archive (
    id        integer     NOT NULL,
    parent_id integer,                 -- nullable: rows without a parent leave this NULL
    code      varchar(15) NOT NULL,    -- unique per row (see uniq_14242 below)
    label     text        NOT NULL,
    CONSTRAINT archive_pkey PRIMARY KEY (id),
    CONSTRAINT fk_14242 FOREIGN KEY (parent_id) REFERENCES archive (id)
);
-- btree is PostgreSQL's default index method, so no USING clause is needed.
CREATE INDEX idx_142 ON archive (parent_id);
CREATE UNIQUE INDEX uniq_14242 ON archive (code);
插入:
-- Sample rows: 'B28.0' (id 2) is a child of 'B28' (id 1) through parent_id.
-- Columns are listed explicitly so the statement does not depend on the
-- table's physical column order.
INSERT INTO archive (id, parent_id, code, label) VALUES
    (1, NULL, 'B28', 'Confidential'),
    (2, 1, 'B28.0', 'Nuclear zone');
创建:
-- Search vocabulary: one row per distinct keyword label (enforced by uniq_242).
CREATE TABLE keyword (
    id integer NOT NULL,
    label text NOT NULL,
    -- Precomputed phonetic code for label; the search queries compare it
    -- against dmetaphone(<search term>).
    label_double_metaphone text NOT NULL
);
-- Constraint name corrected from the typo "eyword_pkey" so it follows the
-- <table>_pkey pattern used by archive_pkey and assoc_kw_archive_pkey.
ALTER TABLE ONLY keyword ADD CONSTRAINT keyword_pkey PRIMARY KEY (id);
CREATE UNIQUE INDEX uniq_242 ON keyword USING btree (label);
插入:
-- Sample keywords; label_double_metaphone is stored precomputed next to each label.
-- Columns are listed explicitly so the statement does not depend on the
-- table's physical column order.
INSERT INTO keyword (id, label, label_double_metaphone) VALUES
    (1, 'SECURITY', 'SKRT'),
    (2, 'AREA', 'AR'),
    (3, 'NUCLEAR', 'NKLR');
创建:
-- Weighted link between a keyword and an archive.
CREATE TABLE assoc_kw_archive (
    id         integer NOT NULL,
    keyword_id integer,             -- nullable in this schema
    archive_id integer,             -- nullable in this schema
    weight     integer NOT NULL,
    CONSTRAINT assoc_kw_archive_pkey PRIMARY KEY (id),
    CONSTRAINT fk_3421 FOREIGN KEY (archive_id) REFERENCES archive (id),
    CONSTRAINT fk_3422 FOREIGN KEY (keyword_id) REFERENCES keyword (id)
);
-- One index per foreign-key column; btree is the default method.
CREATE INDEX idx_3421 ON assoc_kw_archive (archive_id);
CREATE INDEX idx_3422 ON assoc_kw_archive (keyword_id);
插入:
-- Sample links: archive 1 carries keyword 1; archive 2 carries keywords 1, 2 and 3.
-- Columns are listed explicitly so the statement does not depend on the
-- table's physical column order.
INSERT INTO assoc_kw_archive (id, keyword_id, archive_id, weight) VALUES
    (1, 1, 1, 10),
    (2, 1, 2, 20),
    (3, 2, 2, 30),
    (4, 3, 2, 30);
这里的目标是在数据库中进行搜索。搜索基于用户键入的字符串,输出按相关性排序的档案列表。档案的相关性取决于三个因素:(原文此处的因素列表缺失;从下面的查询来看,应为关键字的语音匹配、编辑距离以及关联权重。)
我尝试过多个版本的 SQL 查询,但现在我已经无法退一步审视整个问题了。
archive 表有 100,000 行,keyword 表有 80,000 行,这两个实体之间有 1,000,000 条关联。
这是我的最新版本,它可以正常工作,但速度非常慢:
-- Top 10 archives linked to BOTH a keyword that sounds like 'Security' and a
-- keyword that sounds like 'Nuclear'. levenshtein() and dmetaphone() come from
-- PostgreSQL's fuzzystrmatch extension.
select
    arc.id,
    arc.code,
    arc.label,
    -- Smallest combined edit distance over all matching keyword pairs.
    min(
        (levenshtein(lower('Security'), lower(kw_sec.label)) + 1)
        + (levenshtein(lower('Nuclear'), lower(kw_nuc.label)) + 1)
    ) as distF,
    -- Largest combined association weight over all matching keyword pairs.
    max(link_sec.weight + link_nuc.weight) as poidF
from archive as arc
-- First search term: association row and its keyword.
inner join assoc_kw_archive as link_sec
    on link_sec.archive_id = arc.id
inner join keyword as kw_sec
    on kw_sec.id = link_sec.keyword_id
-- Second search term: association row and its keyword.
inner join assoc_kw_archive as link_nuc
    on link_nuc.archive_id = arc.id
inner join keyword as kw_nuc
    on kw_nuc.id = link_nuc.keyword_id
-- Phonetic pre-filter: stored code within edit distance 1 of the term's code.
where levenshtein(dmetaphone('Security'), kw_sec.label_double_metaphone) < 2
    and levenshtein(dmetaphone('Nuclear'), kw_nuc.label_double_metaphone) < 2
group by arc.id, arc.code, arc.label
order by distF asc, poidF desc
limit 10;
我为每个关键字都做了一次连接,这正是它变慢的原因!但我找不到其他解决方案。
答案 0 :(得分:0)
我认为问题在于对带有距离计算的表做了完整的连接。这里是另一种方法:先过滤关键字,用子查询保存 where 子句所需的距离信息,然后使用条件聚合来获取所需的结果。
查询最终看起来像:
-- Single-pass alternative to joining assoc_kw_archive/keyword once per search
-- term: keywords are filtered phonetically first, then each archive's matches
-- are folded with conditional aggregation.
--
-- Fixes relative to the previous version of this query:
--   * the keyword filter is a WHERE over a derived table; PostgreSQL does not
--     accept HAVING on select-list aliases without a GROUP BY;
--   * the per-term distances are aggregated separately and then added, since
--     adding two CASE branches on the same row yields NULL unless one keyword
--     matches both terms;
--   * archives must match BOTH terms (HAVING below), and poid is the best
--     weight per term added together, i.e. the same value as
--     max(ka1.weight + ka2.weight) in the self-join formulation.
--
-- levenshtein() and dmetaphone() come from PostgreSQL's fuzzystrmatch extension.
-- To get a ranked top 10, append: order by mindist asc, poid desc limit 10
select
    a.id,
    a.code,
    a.label,
    min(case when k.securityl < 2 then k.security_dist end)
        + min(case when k.nuclearl < 2 then k.nuclear_dist end) as mindist,
    max(case when k.securityl < 2 then ka.weight end)
        + max(case when k.nuclearl < 2 then ka.weight end) as poid
from archive a
inner join assoc_kw_archive ka
    on ka.archive_id = a.id
inner join (
    -- Only keywords phonetically close to at least one search term survive,
    -- so the expensive joins above touch a small keyword set.
    select
        kd.id,
        kd.securityl,
        kd.nuclearl,
        levenshtein(lower('Security'), lower(kd.label)) + 1 as security_dist,
        levenshtein(lower('Nuclear'), lower(kd.label)) + 1 as nuclear_dist
    from (
        select
            kw.id,
            kw.label,
            levenshtein(dmetaphone('Security'), kw.label_double_metaphone) as securityl,
            levenshtein(dmetaphone('Nuclear'), kw.label_double_metaphone) as nuclearl
        from keyword kw
    ) kd
    where kd.securityl < 2
        or kd.nuclearl < 2
) k
    on k.id = ka.keyword_id
group by a.id, a.code, a.label
-- Keep only archives that matched both search terms.
having bool_or(k.securityl < 2)
    and bool_or(k.nuclearl < 2)