假设我有以下表格/字段:
-- One row per project.
CREATE TABLE tbl_projects (
    prjc_id int PRIMARY KEY
);

-- One row per (project, requirement category) assignment.
-- NOTE(review): queries that treat row counts as set sizes presumably expect
-- each (preq_prjc_id, preq_type_id) pair to appear at most once; consider a
-- UNIQUE constraint on the pair if that is the intent.
CREATE TABLE tbl_project_requirements (
    preq_prjc_id int REFERENCES tbl_projects (prjc_id), -- Foreign key to tbl_projects
    preq_type_id int -- A standardized requirement category
);
给定某个特定项目,我想找出需求类别与它几乎相同的其他项目——或者说,需求类别至少有 75% 重叠的项目。
我可以做以下事情:
-- Lists every project other than @prjc_id whose requirement categories overlap
-- the target project's by at least 75%.
-- Similarity = (categories present in both projects)
--            / (rows of the FULL OUTER JOIN, i.e. categories present in either).
DECLARE @prjc_id int = 1;

-- Requirement categories of the target project.
CREATE TABLE #project_reqs (type_id int);

INSERT INTO #project_reqs (type_id)
SELECT preq_type_id
FROM tbl_project_requirements
WHERE preq_prjc_id = @prjc_id;

SELECT p.prjc_id
FROM tbl_projects AS p
CROSS APPLY (
    -- An aggregate without GROUP BY always yields exactly one row, so every
    -- project gets a similarity value (0.0 when neither side has categories).
    SELECT CASE
               WHEN COUNT(*) = 0 THEN 0.0
               ELSE COALESCE(SUM(CASE WHEN target.type_id = other.preq_type_id THEN 1.0 ELSE 0.0 END), 0.0)
                    / CONVERT(float, COUNT(*))
           END AS similarity
    FROM #project_reqs AS target
    -- FULL OUTER JOIN keeps categories found on only one side; those rows have
    -- a NULL on the other side and therefore count only in the denominator.
    FULL OUTER JOIN (
        SELECT r.preq_type_id
        FROM tbl_project_requirements AS r
        WHERE r.preq_prjc_id = p.prjc_id
    ) AS other
        ON other.preq_type_id = target.type_id
) AS overlap
WHERE p.prjc_id <> @prjc_id
  AND overlap.similarity >= 0.75;

DROP TABLE #project_reqs;
在上面的查询中,我用两个项目共同拥有的需求类别数,除以两个项目不同需求类别的总数(并集),得到重叠百分比。
虽然这能用,但我觉得这段代码有“坏味道”(code smell),而且扩展性不会好。有没有更好的办法来计算两个项目之间子记录的重叠程度?也许是某种部分哈希匹配,或者……?
我想我找到了一个高效的解决方案:
-- Lists every project other than @prjc_id whose requirement categories overlap
-- the target project's by at least 75%, using a single grouped pass over
-- tbl_project_requirements instead of a per-project subquery.
DECLARE @prjc_id int = 1;

-- Requirement categories of the target project.
CREATE TABLE #project_reqs (type_id int);

INSERT INTO #project_reqs (type_id)
SELECT preq_type_id
FROM tbl_project_requirements
WHERE preq_prjc_id = @prjc_id;

-- Size of the target project's category list (float so the later division is
-- not integer division).
DECLARE @project_req_count float;

SELECT @project_req_count = COUNT(*)
FROM #project_reqs;

-- Per candidate project: total categories and categories shared with the target.
CREATE TABLE #projects (
    pj_prjc_id int,
    pj_func_count float,
    pj_func_common float
);

INSERT INTO #projects (pj_prjc_id, pj_func_count, pj_func_common)
SELECT r.preq_prjc_id,
       COUNT(*),          -- all categories of the candidate project
       COUNT(t.type_id)   -- only rows that matched a target category (non-NULL)
FROM tbl_project_requirements AS r
LEFT OUTER JOIN #project_reqs AS t
    ON r.preq_type_id = t.type_id
-- Skip the target project itself; it would otherwise always match at 100%.
WHERE r.preq_prjc_id <> @prjc_id
GROUP BY r.preq_prjc_id
-- Projects with nothing in common can never reach the threshold.
HAVING COUNT(t.type_id) <> 0;

-- common / (candidate count + target count - common): shared categories over
-- the combined distinct categories of both projects.
SELECT pj_prjc_id
FROM #projects
WHERE pj_func_common / (pj_func_count + @project_req_count - pj_func_common) >= 0.75;

DROP TABLE #project_reqs;
DROP TABLE #projects;
答案 0 :(得分:1)
要找出共同的需求类别,有更优雅的方法。
-- For every ordered pair of different projects (proj1, proj2): the number of
-- requirement categories they share, and that number as a percentage of
-- proj1's own categories. Only pairs at 75% or above are returned.
;with proj as (
    -- One row per project with its number of distinct requirement categories.
    -- DISTINCT keeps this consistent with INTERSECT below, which de-duplicates.
    select preq_prjc_id as pr,
           count(distinct preq_type_id) as typeCnt
    from tbl_project_requirements
    group by preq_prjc_id
)
, crossProj as (
    -- Every ordered pair of different projects (Cartesian product minus self-pairs).
    select p1.pr as proj1,
           p2.pr as proj2,
           p1.typeCnt
    from proj as p1
    cross join proj as p2
    where p1.pr <> p2.pr
)
, req as (
    -- Categories of proj1, tagged with the pair they belong to ...
    select pq.preq_type_id, cp.proj1, cp.proj2, cp.typeCnt
    from tbl_project_requirements as pq
    inner join crossProj as cp
        on pq.preq_prjc_id = cp.proj1
    intersect -- ... kept only when proj2 of the same pair has the category too
    select pq.preq_type_id, cp.proj1, cp.proj2, cp.typeCnt
    from tbl_project_requirements as pq
    inner join crossProj as cp
        on pq.preq_prjc_id = cp.proj2
)
-- calculate final result
select proj1,
       proj2,
       count(preq_type_id) as commonPreq,
       -- percent of common requirements relative to proj1
       count(preq_type_id) * 100.00 / typeCnt as [percentage]
from req
group by proj1, proj2, typeCnt
-- ">=" so that a pair at exactly 75% satisfies "at least 75%"
having count(preq_type_id) * 100.00 / typeCnt >= 75
order by [percentage] desc;
-- Variant of the pairwise comparison that measures overlap symmetrically:
-- shared categories divided by the combined distinct categories of both
-- projects (typeCnt1 + typeCnt2 - shared). Only pairs at 75% or above are
-- returned; each unordered pair appears twice, once in each direction.
;with proj as (
    -- One row per project with its number of distinct requirement categories.
    -- DISTINCT keeps this consistent with INTERSECT below, which de-duplicates.
    select preq_prjc_id as pr,
           count(distinct preq_type_id) as typeCnt
    from tbl_project_requirements
    group by preq_prjc_id
)
, crossProj as (
    -- Every ordered pair of different projects (Cartesian product minus self-pairs).
    select p1.pr as proj1,
           p2.pr as proj2
    from proj as p1
    cross join proj as p2
    where p1.pr <> p2.pr
)
, req as (
    -- Categories of proj1, tagged with the pair they belong to ...
    select pq.preq_type_id, cp.proj1, cp.proj2
    from tbl_project_requirements as pq
    inner join crossProj as cp
        on pq.preq_prjc_id = cp.proj1
    intersect -- ... kept only when proj2 of the same pair has the category too
    select pq.preq_type_id, cp.proj1, cp.proj2
    from tbl_project_requirements as pq
    inner join crossProj as cp
        on pq.preq_prjc_id = cp.proj2
)
-- calculate final result
select req.proj1,
       req.proj2,
       count(req.preq_type_id) as commonPreq,
       -- percent of common requirements relative to the union of both projects
       count(req.preq_type_id) * 100.00
           / (p1.typeCnt + p2.typeCnt - count(req.preq_type_id)) as [percentage]
from req
inner join proj as p1
    on req.proj1 = p1.pr
inner join proj as p2
    on req.proj2 = p2.pr
group by req.proj1, req.proj2, p1.typeCnt, p2.typeCnt
-- ">=" so that a pair at exactly 75% satisfies "at least 75%"
having count(req.preq_type_id) * 100.00
           / (p1.typeCnt + p2.typeCnt - count(req.preq_type_id)) >= 75
order by [percentage] desc;
答案 1 :(得分:0)
如果您只为一个项目执行此操作,那么您应该已经知道75%匹配所需的匹配数(或者至少可以轻松快速地计算):
-- Lists the other projects that contain at least 75% of the requirement
-- categories of project @preq_prjc_id. The threshold is relative to the target
-- project's own category count only; extra categories on the other project do
-- not lower its score.
DECLARE @preq_prjc_id INT = 1; -- project to compare against; set as needed

-- Number of shared categories needed to reach 75% of the target's list,
-- rounded up.
DECLARE @num_matches_required INT;

SELECT @num_matches_required = CEILING(COUNT(*) * 0.75)
FROM
    tbl_project_requirements
WHERE
    preq_prjc_id = @preq_prjc_id;

SELECT
    R2.preq_prjc_id -- One reason not to use abbreviations... I have to think, "Is it proj? prj? prjc? prjct?"
FROM
    tbl_project_requirements AS R1 -- rows of the target project
    INNER JOIN tbl_project_requirements AS R2 ON -- rows of other projects with the same category
        R2.preq_type_id = R1.preq_type_id AND
        R2.preq_prjc_id <> @preq_prjc_id
WHERE
    R1.preq_prjc_id = @preq_prjc_id
GROUP BY
    R2.preq_prjc_id
HAVING
    COUNT(*) >= @num_matches_required; -- one joined row per shared category