SQL集合比较,第二部分:如何连接“集合的集合”

时间:2010-09-07 21:55:40

标签: sql sql-server tsql

这个问题让我想起了与整集比较相关的几个问题。给定:

  1. collection:一个由若干集合组成的集合,
  2. probe:一个集合,
  3. 三个问题:

    1. 如何在collection中找到与probe逐个元素完全匹配的所有集合?
    2. 如何在不使用显式循环结构的情况下,找到collection中与一组probe集合相匹配的所有集合?也就是说,如何连接“集合的集合”?
    3. 这是关系除法(relational division)吗?如果不是,它是什么?
    4. 我对问题1有一个不错的解决方案(见下文)。

      对于问题2,我还没有一个像样的关系式解决方案。有人愿意试试吗?

      测试数据:

      -- Rebuild the sample fixture from scratch so the script can be re-run
      -- within the same session.
      IF OBJECT_ID('tempdb..#elements') IS NOT NULL
          DROP TABLE #elements;
      IF OBJECT_ID('tempdb..#sets') IS NOT NULL
          DROP TABLE #sets;

      -- #sets holds one row per set; #elements holds one row per (set, member).
      CREATE TABLE #sets (
          set_no INT,
          PRIMARY KEY (set_no)
      );
      CREATE TABLE #elements (
          set_no INT,
          elem   CHAR(1),
          PRIMARY KEY (set_no, elem)
      );

      -- Set 1 = {A, B, C, D, E, F}
      INSERT INTO #elements (set_no, elem) VALUES (1, 'A');
      INSERT INTO #elements (set_no, elem) VALUES (1, 'B');
      INSERT INTO #elements (set_no, elem) VALUES (1, 'C');
      INSERT INTO #elements (set_no, elem) VALUES (1, 'D');
      INSERT INTO #elements (set_no, elem) VALUES (1, 'E');
      INSERT INTO #elements (set_no, elem) VALUES (1, 'F');
      -- Set 2 = {A, B, C}
      INSERT INTO #elements (set_no, elem) VALUES (2, 'A');
      INSERT INTO #elements (set_no, elem) VALUES (2, 'B');
      INSERT INTO #elements (set_no, elem) VALUES (2, 'C');
      -- Set 3 = {D, E, F}
      INSERT INTO #elements (set_no, elem) VALUES (3, 'D');
      INSERT INTO #elements (set_no, elem) VALUES (3, 'E');
      INSERT INTO #elements (set_no, elem) VALUES (3, 'F');
      -- Set 4 = {B, C, F}
      INSERT INTO #elements (set_no, elem) VALUES (4, 'B');
      INSERT INTO #elements (set_no, elem) VALUES (4, 'C');
      INSERT INTO #elements (set_no, elem) VALUES (4, 'F');
      -- Set 5 = {F}
      INSERT INTO #elements (set_no, elem) VALUES (5, 'F');

      -- Derive the list of sets from the element rows just loaded.
      INSERT INTO #sets (set_no)
      SELECT DISTINCT set_no
      FROM #elements;
      

      问题1的准备数据和解决方案(单个集合的查找):

      -- Single probe set {B, C, F} used by question 1.
      IF OBJECT_ID('tempdb..#probe') IS NOT NULL
          DROP TABLE #probe;
      CREATE TABLE #probe (
          elem CHAR(1),
          PRIMARY KEY (elem)
      );
      INSERT INTO #probe (elem) VALUES ('B');
      INSERT INTO #probe (elem) VALUES ('C');
      INSERT INTO #probe (elem) VALUES ('F');

      -- I think this works.....upvotes for anyone who can demonstrate otherwise
      -- A set matches the probe when neither side holds an element the other lacks:
      --   first NOT EXISTS  : no element of the set is missing from #probe
      --   second NOT EXISTS : no element of #probe is missing from the set
      SELECT st.set_no
      FROM #sets AS st
      WHERE NOT EXISTS (
                SELECT *
                FROM #elements AS el
                WHERE el.set_no = st.set_no
                  AND NOT EXISTS (
                          SELECT *
                          FROM #probe AS pr
                          WHERE pr.elem = el.elem
                      )
            )
        AND NOT EXISTS (
                SELECT *
                FROM #probe AS pr
                WHERE NOT EXISTS (
                          SELECT *
                          FROM #elements AS el
                          WHERE el.set_no = st.set_no
                            AND el.elem = pr.elem
                      )
            );
      

      问题2的准备数据(尚无解决方案):

      -- Several probe sets at once, keyed by probe_no, for question 2.
      IF OBJECT_ID('tempdb..#multi_probe') IS NOT NULL
          DROP TABLE #multi_probe;
      CREATE TABLE #multi_probe (
          probe_no INT,
          elem     CHAR(1),
          PRIMARY KEY (probe_no, elem)
      );

      -- Probe 1 = {B, C, F}
      INSERT INTO #multi_probe (probe_no, elem) VALUES (1, 'B');
      INSERT INTO #multi_probe (probe_no, elem) VALUES (1, 'C');
      INSERT INTO #multi_probe (probe_no, elem) VALUES (1, 'F');
      -- Probe 2 = {C, F}
      INSERT INTO #multi_probe (probe_no, elem) VALUES (2, 'C');
      INSERT INTO #multi_probe (probe_no, elem) VALUES (2, 'F');
      -- Probe 3 = {A, B, C}
      INSERT INTO #multi_probe (probe_no, elem) VALUES (3, 'A');
      INSERT INTO #multi_probe (probe_no, elem) VALUES (3, 'B');
      INSERT INTO #multi_probe (probe_no, elem) VALUES (3, 'C');

      -- some magic here.....

      -- Expected result set:
      -- probe_no | set_no
      -- ---------|--------
      -- 1        | 4
      -- 3        | 2
      

3 个答案:

答案 0 :(得分:2)

好的,让我们一步一步解决问题2:

(1)将测试集与探针集按各自的元素做内连接。这样就能看出测试集和探针集之间的关系(哪个测试集与哪个探针集有哪些共同元素):

-- Step 1: pair every test-set element with each probe-set row that holds the
-- same element, exposing which sets share which elements.
-- NOTE(review): the question's #multi_probe names its key probe_no; this query
-- assumes @multi_probe exposes that key as set_no — confirm against the
-- table-variable declaration, which is not shown here.
SELECT
    ts.set_no AS [test set],
    ps.set_no AS [probe set],
    ts.elem   AS [common element]
FROM @elements AS ts
INNER JOIN @multi_probe AS ps
    ON ps.elem = ts.elem

结果:

test set    probe set   common element
----------- ----------- --------------
1           3           A
1           1           B
1           3           B
1           1           C
1           2           C
1           3           C
1           1           F
1           2           F
2           3           A
2           1           B
2           3           B
2           1           C
2           2           C
2           3           C
3           1           F
3           2           F
4           1           B
4           3           B
4           1           C
4           2           C
4           3           C
4           1           F
4           2           F
5           1           F
5           2           F

(2)计算每个测试集和探针集之间的公共元素数量(内部连接意味着我们已经将“不匹配”放在一边)

-- Step 2: count the elements shared by each (test set, probe set) pair.
-- Pairs with nothing in common never appear because the join is an inner join.
-- NOTE(review): assumes @multi_probe keys its sets by set_no (the question's
-- #multi_probe uses probe_no) — confirm against the declaration.
SELECT
    ts.set_no AS [test set],
    ps.set_no AS [probe set],
    COUNT(*)  AS [common element count]
FROM @elements AS ts
INNER JOIN @multi_probe AS ps
    ON ps.elem = ts.elem
GROUP BY
    ts.set_no,
    ps.set_no
ORDER BY
    ts.set_no,
    ps.set_no

结果:

 test set    probe set   common element count
----------- ----------- --------------------
1           1           3
1           2           2
1           3           3
2           1           2
2           2           1
2           3           3
3           1           1
3           2           1
4           1           3
4           2           2
4           3           2
5           1           1
5           2           1

(3)在每一行上带上测试集和探针集的计数(子查询可能不是最优雅的)

-- Step 3: alongside the shared-element count, report the full size of the
-- test set and of the probe set for each pair (one correlated count per side).
-- NOTE(review): assumes @multi_probe keys its sets by set_no (the question's
-- #multi_probe uses probe_no) — confirm against the declaration.
SELECT
    ts.set_no AS [test set],
    ps.set_no AS [probe set],
    COUNT(*)  AS [common element count],
    (
        SELECT COUNT(*)
        FROM @elements AS ts_all
        WHERE ts_all.set_no = ts.set_no
    ) AS [test set count],
    (
        SELECT COUNT(*)
        FROM @multi_probe AS ps_all
        WHERE ps_all.set_no = ps.set_no
    ) AS [probe set count]
FROM @elements AS ts
INNER JOIN @multi_probe AS ps
    ON ps.elem = ts.elem
GROUP BY
    ts.set_no,
    ps.set_no
ORDER BY
    ts.set_no,
    ps.set_no

结果:

test set    probe set   common element count test set count probe set count
----------- ----------- -------------------- -------------- ---------------
1           1           3                    6              3
1           2           2                    6              2
1           3           3                    6              3
2           1           2                    3              3
2           2           1                    3              2
2           3           3                    3              3
3           1           1                    3              3
3           2           1                    3              2
4           1           3                    3              3
4           2           2                    3              2
4           3           2                    3              3
5           1           1                    1              3
5           2           1                    1              2

(4)找到解决方案:只保留那些具有相同元素数的测试集和探测集,这个数字也是公共元素的数量,即测试集和探测集是相同的

-- Step 4: keep only the pairs where the shared-element count equals the size
-- of the test set and also the size of the probe set, i.e. the two sets are
-- identical.
-- NOTE(review): assumes @multi_probe keys its sets by set_no (the question's
-- #multi_probe uses probe_no) — confirm against the declaration.
SELECT
    ts.set_no AS [test set],
    ps.set_no AS [probe set]
FROM @elements AS ts
INNER JOIN @multi_probe AS ps
    ON ps.elem = ts.elem
GROUP BY
    ts.set_no,
    ps.set_no
HAVING
    -- every element of the test set is shared ...
    COUNT(*) = (
        SELECT COUNT(*)
        FROM @elements AS ts_all
        WHERE ts_all.set_no = ts.set_no
    )
    -- ... and so is every element of the probe set
    AND COUNT(*) = (
        SELECT COUNT(*)
        FROM @multi_probe AS ps_all
        WHERE ps_all.set_no = ps.set_no
    )
ORDER BY
    ts.set_no,
    ps.set_no

结果:

test set    probe set
----------- -----------
2           3
4           1

请原谅@而不是#,我更喜欢表变量:)

答案 1 :(得分:1)

我可以用SQL Server语法为问题(1)提交一个更“数学倾向”的解决方案:

-- Question 1: return the sets whose elements are exactly those of #probe.
-- Reads the question's temp tables #sets, #elements and #probe. The element
-- table was previously written as the table variable @elements, which is not
-- declared alongside the #sets/#probe temp tables this query also uses.
SELECT
    s.set_no
FROM
    #sets s
    JOIN #elements e ON s.set_no = e.set_no
    -- LEFT JOIN keeps every element of the test set; p.elem is NULL when the
    -- element is absent from the probe.
    LEFT JOIN #probe p ON e.elem = p.elem
GROUP BY
    s.set_no
HAVING
    -- all elements of the test set were matched in the probe (test ⊆ probe)
    COUNT(DISTINCT p.elem) = COUNT(*)
    -- and both sets have the same number of elements (|test| = |probe|)
    AND COUNT(*) = (SELECT COUNT(*) FROM #probe)
  • COUNT(*)始终表示每个测试集中的元素数量(因为使用了LEFT JOIN,测试集的每个元素都会保留一行)
  • COUNT(DISTINCT p.elem)表示测试集元素与探针集元素之间的“匹配”数(因为NULL不会被计数),即探针集中有多少元素同时存在于测试集中

翻译成数学术语COUNT(DISTINCT p.elem) = COUNT(*)表示测试集是探测集的一个子集(test ⊆ probe),而COUNT(*) = (SELECT COUNT(*) FROM #probe)表示测试集的基数等于探针集的基数(|test| = |probe|)。从这两个条件我们得出结论test = probe

答案 2 :(得分:0)

[回答我自己的问题......]

首先,解决方案。 EXCEPT语法可以优雅地处理多个列和NULL,因此这更接近于一般解决方案:

-- Pair every test set with every probe set and keep the pairs whose element
-- lists are identical: neither "test EXCEPT probe" nor "probe EXCEPT test"
-- may return a row.
SELECT
    ts.set_no AS test_set_no,
    ps.set_no AS probe_set_no
FROM #test_sets AS ts
CROSS JOIN #probe_sets AS ps
WHERE NOT EXISTS (
          -- elements of the test set that the probe set lacks
          SELECT elem FROM #test_elements AS t_el WHERE t_el.set_no = ts.set_no
          EXCEPT
          SELECT elem FROM #probe_elements AS p_el WHERE p_el.set_no = ps.set_no
      )
  AND NOT EXISTS (
          -- elements of the probe set that the test set lacks
          SELECT elem FROM #probe_elements AS p_el WHERE p_el.set_no = ps.set_no
          EXCEPT
          SELECT elem FROM #test_elements AS t_el WHERE t_el.set_no = ts.set_no
      )
ORDER BY
    test_set_no,
    probe_set_no

接下来,修改后的数据集:

-- Revised fixture: test sets and probe sets share one shape, a *_sets table
-- with one row per set and a *_elements table with one row per (set, member).
IF OBJECT_ID('tempdb..#test_elements') IS NOT NULL
    DROP TABLE #test_elements;
IF OBJECT_ID('tempdb..#test_sets') IS NOT NULL
    DROP TABLE #test_sets;

CREATE TABLE #test_sets (
    set_no INT,
    PRIMARY KEY (set_no)
);
CREATE TABLE #test_elements (
    set_no INT,
    elem   CHAR(1),
    PRIMARY KEY (set_no, elem)
);

-- Test set 1 = {A, B, C, D, E, F}
INSERT INTO #test_elements (set_no, elem) VALUES (1, 'A');
INSERT INTO #test_elements (set_no, elem) VALUES (1, 'B');
INSERT INTO #test_elements (set_no, elem) VALUES (1, 'C');
INSERT INTO #test_elements (set_no, elem) VALUES (1, 'D');
INSERT INTO #test_elements (set_no, elem) VALUES (1, 'E');
INSERT INTO #test_elements (set_no, elem) VALUES (1, 'F');
-- Test set 2 = {A, B, C}
INSERT INTO #test_elements (set_no, elem) VALUES (2, 'A');
INSERT INTO #test_elements (set_no, elem) VALUES (2, 'B');
INSERT INTO #test_elements (set_no, elem) VALUES (2, 'C');
-- Test set 3 = {D, E, F}
INSERT INTO #test_elements (set_no, elem) VALUES (3, 'D');
INSERT INTO #test_elements (set_no, elem) VALUES (3, 'E');
INSERT INTO #test_elements (set_no, elem) VALUES (3, 'F');
-- Test set 4 = {B, C, F}
INSERT INTO #test_elements (set_no, elem) VALUES (4, 'B');
INSERT INTO #test_elements (set_no, elem) VALUES (4, 'C');
INSERT INTO #test_elements (set_no, elem) VALUES (4, 'F');
-- Test set 5 = {F}
INSERT INTO #test_elements (set_no, elem) VALUES (5, 'F');

-- Derive the list of test sets from the element rows just loaded.
INSERT INTO #test_sets (set_no)
SELECT DISTINCT set_no
FROM #test_elements;

IF OBJECT_ID('tempdb..#probe_elements') IS NOT NULL
    DROP TABLE #probe_elements;
IF OBJECT_ID('tempdb..#probe_sets') IS NOT NULL
    DROP TABLE #probe_sets;

CREATE TABLE #probe_sets (
    set_no INT,
    PRIMARY KEY (set_no)
);
CREATE TABLE #probe_elements (
    set_no INT,
    elem   CHAR(1),
    PRIMARY KEY (set_no, elem)
);

-- Probe set 1 = {B, C, F}
INSERT INTO #probe_elements (set_no, elem) VALUES (1, 'B');
INSERT INTO #probe_elements (set_no, elem) VALUES (1, 'C');
INSERT INTO #probe_elements (set_no, elem) VALUES (1, 'F');
-- Probe set 2 = {C, F}
INSERT INTO #probe_elements (set_no, elem) VALUES (2, 'C');
INSERT INTO #probe_elements (set_no, elem) VALUES (2, 'F');
-- Probe set 3 = {A, B, C}
INSERT INTO #probe_elements (set_no, elem) VALUES (3, 'A');
INSERT INTO #probe_elements (set_no, elem) VALUES (3, 'B');
INSERT INTO #probe_elements (set_no, elem) VALUES (3, 'C');

-- Derive the list of probe sets from the element rows just loaded.
INSERT INTO #probe_sets (set_no)
SELECT DISTINCT set_no
FROM #probe_elements;

作为对比,下面是按照CyberDude的思路使用聚合的写法:

-- Aggregate-based variant: join test and probe elements on the element value,
-- then keep the (test set, probe set) pairs whose shared-element count equals
-- the size of both sets.
SELECT
    ts.set_no AS [test set],
    ps.set_no AS [probe set]
FROM #test_elements AS ts
INNER JOIN #probe_elements AS ps
    ON ps.elem = ts.elem
GROUP BY
    ts.set_no,
    ps.set_no
HAVING
    -- shared elements cover the whole test set ...
    COUNT(*) = (
        SELECT COUNT(*)
        FROM #test_elements AS ts_all
        WHERE ts_all.set_no = ts.set_no
    )
    -- ... and the whole probe set
    AND COUNT(*) = (
        SELECT COUNT(*)
        FROM #probe_elements AS ps_all
        WHERE ps_all.set_no = ps.set_no
    )
ORDER BY
    ts.set_no,
    ps.set_no