SQL:以高效的方式查找两个外键子表之间的重叠百分比

时间:2016-06-22 20:41:49

标签: sql sql-server hash hashset

假设我有以下表格/字段:

-- Projects, identified by prjc_id.
CREATE TABLE tbl_projects (
  prjc_id int PRIMARY KEY
);

-- Requirement categories attached to projects.
-- The column list needs a comma between the two definitions; without it the
-- statement does not parse.
CREATE TABLE tbl_project_requirements (
  preq_prjc_id int, -- Foreign key to tbl_projects
  preq_type_id int  -- A standardized requirement category
);

给定某个特定项目,我想找到需求类别与它几乎相同的其他项目……或者说,需求类别至少有 75% 重叠的项目。

我可以做以下事情:

-- Lists every other project whose requirement categories overlap those of the
-- reference project (@prjc_id) by at least 75%, where overlap is
-- (categories in both projects) / (categories in either project).
DECLARE @prjc_id int = 1;

-- Requirement categories of the reference project.
CREATE TABLE #project_reqs (type_id int);

INSERT INTO #project_reqs (type_id)
SELECT preq_type_id  -- the table's category column is preq_type_id
FROM tbl_project_requirements
WHERE preq_prjc_id = @prjc_id;

SELECT p.prjc_id
FROM tbl_projects AS p
  CROSS APPLY (
    -- The FULL OUTER JOIN yields one row per category present on either side;
    -- rows where both sides are non-NULL are the shared categories, so
    -- SUM(matches) / COUNT(*) is the overlap ratio.
    SELECT CASE
        WHEN COUNT(*) = 0 THEN 0.0  -- neither project has any requirement
        ELSE COALESCE(SUM(CASE WHEN ref.type_id = other.preq_type_id THEN 1.0 ELSE 0.0 END), 0.0)
           / CONVERT(float, COUNT(*))
      END AS similarity
    FROM #project_reqs AS ref
      FULL OUTER JOIN (
        SELECT r.preq_type_id
        FROM tbl_project_requirements AS r
        WHERE r.preq_prjc_id = p.prjc_id
      ) AS other ON other.preq_type_id = ref.type_id
  ) AS sim
WHERE p.prjc_id != @prjc_id
  AND sim.similarity >= 0.75;

-- Clean up so the script can be re-run in the same session.
DROP TABLE #project_reqs;

在上面的查询中,我将两个项目匹配的需求类别数除以两者不同需求类别的总数(即并集的大小),从而得到重叠百分比。

虽然这样可行,但我觉得这段代码有"坏味道",也不认为它能很好地扩展。有没有更高效的方法来计算两个项目之间子记录的重叠度?也许可以用某种部分哈希匹配,或者……?

更新

我想我找到了一个高效的解决方案:

-- Lists the other projects whose requirement categories overlap those of the
-- reference project (@prjc_id) by at least 75%, computed as
-- common / (own count + reference count - common), i.e. intersection / union.
DECLARE @prjc_id int = 1;

-- Requirement categories of the reference project.
CREATE TABLE #project_reqs (type_id int);

INSERT INTO #project_reqs (type_id)
SELECT preq_type_id  -- the table's category column is preq_type_id
FROM tbl_project_requirements
WHERE preq_prjc_id = @prjc_id;

-- Size of the reference project's requirement set (float to force
-- floating-point division below).
DECLARE @project_req_count float;

SELECT @project_req_count = COUNT(*)
FROM #project_reqs;

-- Per candidate project: total requirement rows and how many of them match a
-- category of the reference project.
CREATE TABLE #projects (
  pj_prjc_id int,
  pj_func_count float,
  pj_func_common float
);

INSERT INTO #projects (pj_prjc_id, pj_func_count, pj_func_common)
SELECT preq_prjc_id,
  COUNT(*),
  COUNT(type_id)  -- counts only rows that found a match in #project_reqs
FROM tbl_project_requirements
  LEFT OUTER JOIN #project_reqs
    ON preq_type_id = type_id
WHERE preq_prjc_id <> @prjc_id  -- the reference project would always match itself at 100%
GROUP BY preq_prjc_id
HAVING COUNT(type_id) != 0;     -- projects with nothing in common cannot reach 75%

SELECT pj_prjc_id
FROM #projects
WHERE pj_func_common / (pj_func_count + @project_req_count - pj_func_common) >= 0.75;

DROP TABLE #project_reqs;
DROP TABLE #projects;

2 个答案:

答案 0 :(得分:1)

找到共同的要求有更优雅的方法。

-- For every ordered pair of distinct projects (proj1, proj2) that share at
-- least one requirement category, reports the number of shared categories and
-- that number as a percentage of proj1's requirement count. Only pairs with
-- at least 75% are returned.
;with proj as (
    -- requirement count per project
    select preq_prjc_id pr, count(preq_type_id) typeCnt
    from tbl_project_requirements
    group by preq_prjc_id
)
,crossProj as (
    -- every ordered pair of distinct projects, carrying proj1's count
    select p1.pr proj1, p2.pr proj2, p1.typeCnt
    from proj p1
    cross join proj p2 --make Cartesian product
    where p1.pr <> p2.pr
)
,req as (
    -- categories of proj1, tagged with the pair ...
    select preq_type_id, cp.proj1, cp.proj2, cp.typeCnt
    from tbl_project_requirements pq
    inner join crossProj cp on pq.preq_prjc_id = cp.proj1
    intersect -- what is common
    -- ... intersected with categories of proj2, tagged with the same pair
    select preq_type_id, cp.proj1, cp.proj2, cp.typeCnt
    from tbl_project_requirements pq
    inner join crossProj cp on pq.preq_prjc_id = cp.proj2
)
--calculate final result
select proj1, proj2,
    count(preq_type_id) commonPreq,
    --percent of common requirements relative to proj1
    count(preq_type_id) * 100.00 / typeCnt [percentage]
from req
group by proj1, proj2, typeCnt
-- ">=" so that an overlap of exactly 75% qualifies ("at least 75%")
having count(preq_type_id) * 100.00 / typeCnt >= 75
order by [percentage] desc

更新

-- For every ordered pair of distinct projects (proj1, proj2) that share at
-- least one requirement category, reports the number of shared categories and
-- the overlap percentage common / (count1 + count2 - common), i.e. shared
-- categories over the categories of either project. Only pairs with at least
-- 75% are returned.
;with proj as (
    -- requirement count per project
    select preq_prjc_id pr, count(preq_type_id) typeCnt
    from tbl_project_requirements
    group by preq_prjc_id
)
,crossProj as (
    -- every ordered pair of distinct projects
    select p1.pr proj1, p2.pr proj2, p1.typeCnt
    from proj p1
    cross join proj p2 --make Cartesian product
    where p1.pr <> p2.pr
)
,req as (
    -- categories of proj1, tagged with the pair ...
    select preq_type_id, cp.proj1, cp.proj2
    from tbl_project_requirements pq
    inner join crossProj cp on pq.preq_prjc_id = cp.proj1
    intersect -- what is common
    -- ... intersected with categories of proj2, tagged with the same pair
    select preq_type_id, cp.proj1, cp.proj2
    from tbl_project_requirements pq
    inner join crossProj cp on pq.preq_prjc_id = cp.proj2
)
--calculate final result
select proj1, proj2,
    count(preq_type_id) commonPreq,
    --percent of common requirements relative to the union of both projects
    count(preq_type_id) * 100.00 / (p1.typeCnt + p2.typeCnt - count(preq_type_id)) [percentage]
from req
inner join proj p1 on req.proj1 = p1.pr
inner join proj p2 on req.proj2 = p2.pr
group by proj1, proj2, p1.typeCnt, p2.typeCnt
-- ">=" so that an overlap of exactly 75% qualifies ("at least 75%")
having count(preq_type_id) * 100.00 / (p1.typeCnt + p2.typeCnt - count(preq_type_id)) >= 75
order by [percentage] desc

答案 1 :(得分:0)

如果您只为一个项目执行此操作,那么您应该已经知道75%匹配所需的匹配数(或者至少可以轻松快速地计算):

-- Lists the other projects that share at least 75% of the reference project's
-- requirement categories. The percentage is relative to the reference project
-- only: extra categories on the other project do not lower the score.
DECLARE @preq_prjc_id INT = 1  -- reference project; must be declared before use

DECLARE @num_matches_required INT

-- Smallest number of shared categories that amounts to 75% of the reference
-- project's requirements, rounded up.
SELECT @num_matches_required = CEILING(COUNT(*) * 0.75)
FROM
    tbl_Project_Requirements
WHERE
    preq_prjc_id = @preq_prjc_id

SELECT
    R2.preq_prjc_id    -- One reason not to use abbreviations... I have to think, "Is it proj? prj? prjc? prjct?"
FROM
    tbl_Project_Requirements R1
-- R1 = rows of the reference project, R2 = rows of any other project with the
-- same category
INNER JOIN tbl_Project_Requirements R2 ON
    R2.preq_type_id = R1.preq_type_id AND
    R2.preq_prjc_id <> @preq_prjc_id
WHERE
    R1.preq_prjc_id = @preq_prjc_id
GROUP BY
    R2.preq_prjc_id
HAVING
    COUNT(*) >= @num_matches_required