我正在尝试清理与下表类似的数据集:
dataSource
| ID_dec | ID_base | name | field1 | field2 | field3 |
| 1.01 | 1 | AAA | Cat | Brown | Domesticated |
| 1.02 | 1 | AAA | Cat | Brown | Domesticated |
| 1.03 | 1 | AAA | Feline | NULL | Dom. |
| 1.04 | 1 | AAA | Beautiful cat | NULL | NULL |
| 1.05 | 1 | AAA | NULL | Light Brown | NULL |
| 2.01 | 2 | BBB | Dog | Black | Wild |
| 2.02 | 2 | BBB | Barker | NULL | NULL |
| 3.01 | 3 | CCC | Bird | Yellow | Domesticated |
| 4.01 | 4 | DDD | Snake | NULL | NULL |
| 4.02 | 4 | DDD | NULL | Green | NULL |
| 4.03 | 4 | DDD | NULL | Forest Green | NULL |
| 4.04 | 4 | DDD | NULL | Green | Wild |
| 4.05 | 4 | DDD | NULL | NULL | Wild |
我想针对每个 ID_base,从各个 field[N] 列中分别取出最长的字符串,如下所示:
result
| ID_base | name | field1 | field2 | field3 |
| 1 | AAA | Beautiful cat | Light Brown | Domesticated |
| 2 | BBB | Barker | Black | Wild |
| 3 | CCC | Bird | Yellow | Domesticated |
| 4 | DDD | Snake | Forest Green | Wild |
这个问题以前有人问过,但只针对检查单个字段的情况。以下 SQL 能给出我想要的结果,但在扩展到实际数据集(37 个字段、5665 行;共 4029 个 ID_base,单个 ID_base 最多对应 10 个 ID_dec)时感觉效率低下:
-- For every id_base, return the longest value found in field1, field2 and field3.
--
-- Each derived table (b, c, d) handles exactly one field:
--   1. the innermost query computes the maximum LEN() of that field per id_base;
--   2. the enclosing query keeps only the rows whose value has that length and
--      uses MAX() to pick a single value when several share the same length.
-- Every derived table must filter on the same field whose maximum length it
-- computes (field1 in b, field2 in c, field3 in d).
--
-- The outer LEFT JOINs keep an id_base even when one of its fields is NULL in
-- every row (that field then comes back as NULL). DISTINCT collapses the
-- one-row-per-source-row output of "a" down to one row per (id_base, name).
SELECT DISTINCT
    a.id_base,
    a.name,
    b.result AS field1,
    c.result AS field2,
    d.result AS field3
FROM dataSource AS a
LEFT JOIN (
    SELECT
        y.id_base,
        MAX(y.field1) AS result
    FROM dataSource AS y
    -- INNER JOIN: the WHERE filter on z.leng discards unmatched rows anyway.
    INNER JOIN (
        SELECT
            id_base,
            MAX(LEN(field1)) AS leng
        FROM dataSource
        GROUP BY id_base
    ) AS z
        ON y.id_base = z.id_base
    WHERE LEN(y.field1) = z.leng
    GROUP BY y.id_base
) AS b
    ON a.id_base = b.id_base
LEFT JOIN (
    SELECT
        y.id_base,
        MAX(y.field2) AS result
    FROM dataSource AS y
    INNER JOIN (
        SELECT
            id_base,
            MAX(LEN(field2)) AS leng
        FROM dataSource
        GROUP BY id_base
    ) AS z
        ON y.id_base = z.id_base
    WHERE LEN(y.field2) = z.leng
    GROUP BY y.id_base
) AS c
    ON a.id_base = c.id_base
LEFT JOIN (
    SELECT
        y.id_base,
        MAX(y.field3) AS result
    FROM dataSource AS y
    INNER JOIN (
        SELECT
            id_base,
            MAX(LEN(field3)) AS leng
        FROM dataSource
        GROUP BY id_base
    ) AS z
        ON y.id_base = z.id_base
    WHERE LEN(y.field3) = z.leng
    GROUP BY y.id_base
) AS d
    ON a.id_base = d.id_base
这个查询的最佳方法是什么?
答案 0 :(得分:1)
-- Longest value of field1..field3 for each (id_base, name).
--
-- max_len computes, per group, the maximum LEN() of every field in one pass.
-- Each scalar subquery then fetches one value of that length for the group.
-- ORDER BY ... DESC makes TOP 1 deterministic when several values share the
-- maximum length (the alphabetically greatest one is returned).
-- If a field is NULL in every row of a group, its maximum length is NULL, no
-- row satisfies the equality, and the subquery yields NULL.
WITH max_len AS (
    SELECT
        id_base,
        name,
        MAX(LEN(field1)) AS l1,
        MAX(LEN(field2)) AS l2,
        MAX(LEN(field3)) AS l3
    FROM datasource
    GROUP BY id_base, name
)
SELECT
    m.id_base,
    m.name,
    (
        SELECT TOP 1 s.field1
        FROM datasource AS s
        WHERE s.id_base = m.id_base
          AND LEN(s.field1) = m.l1
        ORDER BY s.field1 DESC
    ) AS field1,
    (
        SELECT TOP 1 s.field2
        FROM datasource AS s
        WHERE s.id_base = m.id_base
          AND LEN(s.field2) = m.l2
        ORDER BY s.field2 DESC
    ) AS field2,
    (
        SELECT TOP 1 s.field3
        FROM datasource AS s
        WHERE s.id_base = m.id_base
          AND LEN(s.field3) = m.l3
        ORDER BY s.field3 DESC
    ) AS field3
FROM max_len AS m
答案 1 :(得分:1)
另一种更简单的写法:
-- Longest value of field1..field3 for each distinct (id_base, name).
--
-- Each scalar subquery sorts the group's rows by the length of one field,
-- longest first, and takes the first row. In SQL Server NULL sorts lowest,
-- so with DESC a NULL field is only returned when the group has no non-NULL
-- value for it. The secondary sort key makes the pick deterministic when
-- several values share the maximum length.
SELECT
    t.id_base,
    t.name,
    (
        SELECT TOP 1 s.field1
        FROM dataSource AS s
        WHERE s.id_base = t.id_base
        ORDER BY LEN(s.field1) DESC, s.field1 DESC
    ) AS field1,
    (
        SELECT TOP 1 s.field2
        FROM dataSource AS s
        WHERE s.id_base = t.id_base
        ORDER BY LEN(s.field2) DESC, s.field2 DESC
    ) AS field2,
    (
        SELECT TOP 1 s.field3
        FROM dataSource AS s
        WHERE s.id_base = t.id_base
        ORDER BY LEN(s.field3) DESC, s.field3 DESC
    ) AS field3
FROM (
    SELECT DISTINCT id_base, name
    FROM dataSource
) AS t
答案 2 :(得分:0)
-- Self-joins dataSource three times (t1, t2, t3) with FULL JOINs and takes, for
-- each output column, the first non-NULL value among t1, t2 and t3.
-- The join predicates tie t1 to rows holding the max-length field1, t2 to rows
-- holding the max-length field2 and t3 to rows holding the max-length field3,
-- each maximum being computed per ID_base by a correlated subquery.
-- NOTE(review): FULL JOIN also preserves rows that fail the ON predicate
-- (NULL-extended on the other side), so this can presumably return more than
-- one row per ID_base, and COALESCE may then pick a value from a non-longest
-- t1 row -- verify against the expected result before relying on it.
Select coalesce(t1.ID_base, t2.ID_base, t3.ID_base) base,
coalesce(t1.Name, t2.Name, t3.Name) Name,
coalesce(t1.field1, t2.field1, t3.field1) field1,
coalesce(t1.field2, t2.field2, t3.field2) field2,
coalesce(t1.field3, t2.field3, t3.field3) field3
from dataSource t1
-- t1/t2 pairing: same ID_base, t1 has the longest field1, t2 the longest field2.
full join dataSource t2 on t2.ID_base = t1.ID_base
and len(t1.field1) = (Select Max(len(field1)) from dataSource
where ID_base = t1.ID_base)
and len(t2.field2) = (Select Max(len(field2)) from dataSource
where ID_base = t2.ID_base)
-- t3 pairing: same ID_base as t1, t3 has the longest field3.
full join dataSource t3 on t3.ID_base = t1.ID_base
and len(t3.field3) = (Select Max(len(field3)) from dataSource
where ID_base = t3.ID_base)