我有一个名为samples的SQL表,定义如下:
sampledate (datetime, 24 records per day per parameter)
parameterID (int)
value (decimal)
valid (bit, 1=valid data, 0=invalid data)
(sampledate, parameterID) 这一组合是唯一的。
每个采样的格式为02/02/2011 12:00,因此每天每个参数ID有24行或更少(例如,探针可能会失败或处于维护状态,并且输出的样本少于24个)。
我必须计算每个参数的平均每日值。只有同时满足以下两个条件时,平均值才有效:1)当天至少有18个有效样本;2)当天不存在连续5个(或更多)无效样本。
条件1)对于给定的@parameter来说非常简单:
-- Daily average of the valid samples for one parameter (@parameter).
-- isValid is 1 when the day has at least 18 valid samples, otherwise 0.
-- Invalid samples are removed in WHERE (it is a row filter, not a group
-- filter), so `valid` no longer has to be a grouping column.
-- The result is ordered by the grouped day: the raw sampledate is not
-- available after GROUP BY and cannot be used in ORDER BY.
SELECT
    DATEADD(DAY, DATEDIFF(DAY, 0, sampledate), 0) AS avgdate,
    AVG(value) AS avg,
    parameterID,
    isValid = CASE
        WHEN COUNT(value) > 17 THEN 1
        ELSE 0
    END
FROM samples
WHERE parameterID = @parameter
    AND valid = 1
GROUP BY
    parameterID,
    DATEADD(DAY, DATEDIFF(DAY, 0, sampledate), 0)
ORDER BY avgdate
如何添加条件2,可以归结为在24小时范围内计算连续0,可能具有最佳性能?
我们有数百万个样本,而且游标很慢。
答案 0 :(得分:1)
一些想法和评论......
有很多方法可以将日期时间转换为仅含日期的值。我使用 DATEADD(DAY, DATEDIFF(DAY, 0, <日期时间值>), 0)。但是对于下面的代码,我建议我们先假装有一个 justDate 字段,以便让代码更短 :)
序列很重要,表格没有“序列ID”。 ROW_NUMBER()可以给你这个......
-- Illustrative fragment: numbers the samples of each (parameter, day) partition
-- in sampledate order. "justDate" is the pretend date-only column assumed in the
-- surrounding text; it is not a real column of the samples table.
ROW_NUMBER() OVER (PARTITION BY parameter_id, justDate ORDER BY sampledate) AS "sequence_id"
似乎有几种方法可以做到这一点。
对于每个样品,加入下一个样品,共五次。我不喜欢它,但它可能是最简单的选择......
-- Daily average per parameter, keeping only days that have
--   * more than 17 valid samples, and
--   * no run of 5 consecutive invalid samples.
-- Each sample is joined to the 4 samples that follow it on the same day; if all
-- five are invalid the summed "invalid" flags reach 5 and the day is rejected.
WITH
sequenced_samples
AS
(
    SELECT
        parameterID AS "parameter_id",
        sampledate  AS "sample_date_time",
        DATEADD(DAY, DATEDIFF(DAY, 0, sampledate), 0) AS "sample_date",
        -- Column aliases are not visible inside the same SELECT, so the
        -- partition uses the base column parameterID rather than parameter_id.
        ROW_NUMBER() OVER (
            PARTITION BY parameterID, DATEADD(DAY, DATEDIFF(DAY, 0, sampledate), 0)
            ORDER BY sampledate
        ) AS "sequence_id",
        -- Invalid samples contribute NULL so AVG() ignores them.
        CASE WHEN valid = 1 THEN value ELSE NULL END AS "value",
        -- 1 for an invalid sample, 0 for a valid one.
        CASE WHEN valid = 1 THEN 0 ELSE 1 END AS "invalid"
    FROM
        samples
)
SELECT
    "sample_1".parameter_id,
    "sample_1".sample_date,
    -- Qualified: every joined copy exposes a "value" column.
    AVG("sample_1".value) AS average_value
FROM
    -- The self-joins must read the CTE; the base table has no sequence_id,
    -- sample_date or invalid columns.
    sequenced_samples "sample_1"
LEFT JOIN
    sequenced_samples "sample_2"
        ON  "sample_2".parameter_id = "sample_1".parameter_id
        AND "sample_2".sample_date  = "sample_1".sample_date
        AND "sample_2".sequence_id  = "sample_1".sequence_id + 1
LEFT JOIN
    sequenced_samples "sample_3"
        ON  "sample_3".parameter_id = "sample_1".parameter_id
        AND "sample_3".sample_date  = "sample_1".sample_date
        AND "sample_3".sequence_id  = "sample_1".sequence_id + 2
LEFT JOIN
    sequenced_samples "sample_4"
        ON  "sample_4".parameter_id = "sample_1".parameter_id
        AND "sample_4".sample_date  = "sample_1".sample_date
        AND "sample_4".sequence_id  = "sample_1".sequence_id + 3
LEFT JOIN
    sequenced_samples "sample_5"
        ON  "sample_5".parameter_id = "sample_1".parameter_id
        AND "sample_5".sample_date  = "sample_1".sample_date
        AND "sample_5".sequence_id  = "sample_1".sequence_id + 4
GROUP BY
    "sample_1".parameter_id,
    "sample_1".sample_date
HAVING
    -- Near the end of the day some joined rows are missing; the sum is then
    -- NULL and MAX() ignores it, so incomplete windows never reach 5.
    5 > MAX("sample_1".invalid + "sample_2".invalid + "sample_3".invalid + "sample_4".invalid + "sample_5".invalid)
    -- Number of valid samples in the day must exceed 17.
    AND 17 < (COUNT(*) - SUM("sample_1".invalid))
下一个稍微更加智能(但只是略微)但我没有坐在任何可以访问MS SQL Server的地方,所以我不知道它是否更高效。
而不是4个连接,只需连接一次,但匹配5个连续样本。做两级分组。
-- Daily average per parameter using a single range self-join.
-- For every sample, "sequence_invalidity" is the number of invalid samples in
-- the window made of that sample and the 4 samples following it on the same
-- day. A day is kept when no window reaches 5 (no run of 5 consecutive invalid
-- samples) and the day has more than 17 valid samples.
WITH
sequenced_samples
AS
(
    SELECT
        parameterID AS "parameter_id",
        sampledate  AS "sample_date_time",
        DATEADD(DAY, DATEDIFF(DAY, 0, sampledate), 0) AS "sample_date",
        -- Column aliases are not visible inside the same SELECT, so the
        -- partition uses the base column parameterID.
        ROW_NUMBER() OVER (
            PARTITION BY parameterID, DATEADD(DAY, DATEDIFF(DAY, 0, sampledate), 0)
            ORDER BY sampledate
        ) AS "sequence_id",
        -- Invalid samples contribute NULL so AVG() ignores them.
        CASE WHEN valid = 1 THEN value ELSE NULL END AS "value",
        -- 1 for an invalid sample, 0 for a valid one.
        CASE WHEN valid = 1 THEN 0 ELSE 1 END AS "invalid"
    FROM
        samples
)
,
checked_samples
AS
(
    SELECT
        "sample".parameter_id,
        "sample".sample_date,
        "sample".value,
        "sample".invalid,
        SUM("next_5_samples".invalid) AS "sequence_invalidity"
    FROM
        sequenced_samples "sample"
    INNER JOIN
        sequenced_samples "next_5_samples"
            ON  "next_5_samples".parameter_id = "sample".parameter_id
            AND "next_5_samples".sample_date  = "sample".sample_date
            -- The window starts at the sample itself so that it spans 5 rows;
            -- starting at +1 would cap the sum at 4 and the check could never fail.
            AND "next_5_samples".sequence_id >= "sample".sequence_id
            AND "next_5_samples".sequence_id <= "sample".sequence_id + 4
    GROUP BY
        "sample".parameter_id,
        "sample".sample_date,
        -- sequence_id keeps one output row per sample; grouping on value alone
        -- would merge samples that share a value (all invalid rows are NULL).
        "sample".sequence_id,
        "sample".value,
        "sample".invalid
)
SELECT
    parameter_id,
    sample_date,
    AVG(value) AS "average_value"
FROM
    checked_samples
GROUP BY
    parameter_id,
    sample_date
HAVING
    5 > MAX(sequence_invalidity)
    -- Number of valid samples in the day must exceed 17.
    AND 17 < (COUNT(*) - SUM(invalid))
最后一个选项是使用递归公用表表达式(CTE)逐条遍历记录。它比游标更难编写,但(根据我的经验)速度更快。
编辑:以下查询的递归CTE中原先有一个左连接,现在已经去掉了。
-- Per-day running totals for one parameter (@parameter_id), built with a
-- recursive CTE that walks each day's samples from the last one back to the first.
-- Output, one row per day:
--   average                          mean of the valid samples (NULL if none)
--   valid_samples                    number of valid samples
--   max_consecutive_invalid_samples  longest run of consecutive invalid samples
WITH
sequenced_samples
AS
(
    SELECT
        parameterID AS "parameter_id",
        sampledate  AS "sample_date_time",
        DATEADD(DAY, DATEDIFF(DAY, 0, sampledate), 0) AS "sample_date",
        -- Column aliases are not visible inside the same SELECT, so the
        -- partitions use the base column parameterID.
        ROW_NUMBER() OVER (
            PARTITION BY parameterID, DATEADD(DAY, DATEDIFF(DAY, 0, sampledate), 0)
            ORDER BY sampledate
        ) AS "sequence_id",
        -- Samples in the day; lets the anchor find the last row without a
        -- correlated COUNT(*) subquery.
        COUNT(*) OVER (
            PARTITION BY parameterID, DATEADD(DAY, DATEDIFF(DAY, 0, sampledate), 0)
        ) AS "sample_count",
        value,
        valid
    FROM
        samples
    WHERE
        -- Filter before recursing so only the requested parameter is walked.
        parameterID = @parameter_id
)
,
recursed_totals
AS
(
    -- Anchor: the last sample of each day.
    -- Explicit casts keep the anchor and recursive column types identical,
    -- which a recursive CTE requires.
    SELECT
        parameter_id,
        sample_date,
        sequence_id - 1 AS "next_sequence_id",
        CAST(CASE WHEN valid = 1 THEN value ELSE 0 END AS FLOAT) AS "cumulative_value",
        CASE WHEN valid = 1 THEN 1 ELSE 0 END AS "cumulative_count",
        CASE WHEN valid = 1 THEN 0 ELSE 1 END AS "cumulative_invalid",
        CASE WHEN valid = 1 THEN 0 ELSE 1 END AS "max_cumulative_invalid"
    FROM
        sequenced_samples
    WHERE
        sequence_id = sample_count

    UNION ALL

    -- Recursive step: fold in the previous sample of the same day.
    SELECT
        "cumulative_samples".parameter_id,
        "cumulative_samples".sample_date,
        "next_sample".sequence_id - 1,
        "cumulative_samples".cumulative_value
            + CAST(CASE WHEN "next_sample".valid = 1 THEN "next_sample".value ELSE 0 END AS FLOAT),
        "cumulative_samples".cumulative_count
            + CASE WHEN "next_sample".valid = 1 THEN 1 ELSE 0 END,
        -- Current run of invalid samples: extended by an invalid sample,
        -- reset to 0 by a valid one.
        CASE
            WHEN "next_sample".valid = 1 THEN 0
            ELSE "cumulative_samples".cumulative_invalid + 1
        END,
        -- Longest run so far: it grows only when the run being extended
        -- already equals the current maximum.
        CASE
            WHEN "next_sample".valid = 1 THEN "cumulative_samples".max_cumulative_invalid
            WHEN "cumulative_samples".cumulative_invalid = "cumulative_samples".max_cumulative_invalid
                THEN "cumulative_samples".max_cumulative_invalid + 1
            ELSE "cumulative_samples".max_cumulative_invalid
        END
    FROM
        recursed_totals AS "cumulative_samples"
    INNER JOIN
        sequenced_samples AS "next_sample"
            ON  "next_sample".parameter_id = "cumulative_samples".parameter_id
            AND "next_sample".sample_date  = "cumulative_samples".sample_date
            AND "next_sample".sequence_id  = "cumulative_samples".next_sequence_id
)
SELECT
    parameter_id,
    sample_date,
    -- NULLIF avoids a divide-by-zero on days with no valid sample.
    cumulative_value / NULLIF(CAST(cumulative_count AS FLOAT), 0) AS "average",
    cumulative_count AS "valid_samples",
    max_cumulative_invalid AS "max_consecutive_invalid_samples"
FROM
    recursed_totals
WHERE
    -- Keep only the row that has consumed the whole day; the CTE also holds
    -- every partial total produced along the way.
    next_sequence_id = 0
答案 1 :(得分:1)
这是一种解决方案,它使用与Dems使用的方法基本相同的方法。我认为我的解决方案中的逻辑有点不同。 (或者它可能只是结构不同......)
-- Daily average per parameter with a validity flag.
-- isValid is 0 when the day has fewer than 18 valid samples, or when some sample
-- and the 4 samples preceding it on the same day are all invalid (a run of 5).
WITH sortedsamples AS (
    SELECT
        sampledate,
        parameterID,
        value,
        valid,
        avgdate = CAST(FLOOR(CAST(sampledate AS float)) AS datetime),
        rownum = ROW_NUMBER() OVER (
            PARTITION BY parameterID, CAST(FLOOR(CAST(sampledate AS float)) AS datetime)
            ORDER BY sampledate
        )
    FROM samples
)
SELECT
    ss1.parameterID,
    ss1.avgdate,
    -- Qualified with ss1: all five joined copies expose a "value" column, so a
    -- bare reference is ambiguous. Invalid samples yield NULL and are skipped
    -- by AVG(), so the mean covers valid data only.
    avg = AVG(CASE WHEN ss1.valid = 1 THEN ss1.value END),
    isValid = CAST(CASE
        WHEN SUM(CAST(ss1.valid AS int)) < 18 THEN 0
        -- Bitwise OR over the 5-sample window is 0 only when all five are
        -- invalid; a missing neighbour (start of day) counts as valid.
        ELSE MIN(CAST(ss1.valid | ISNULL(ss2.valid, 1) | ISNULL(ss3.valid, 1)
                    | ISNULL(ss4.valid, 1) | ISNULL(ss5.valid, 1) AS int))
    END AS bit)
FROM sortedsamples ss1
LEFT JOIN sortedsamples ss2 ON ss1.avgdate = ss2.avgdate
    AND ss1.parameterID = ss2.parameterID AND ss1.rownum = ss2.rownum + 1
LEFT JOIN sortedsamples ss3 ON ss1.avgdate = ss3.avgdate
    AND ss1.parameterID = ss3.parameterID AND ss1.rownum = ss3.rownum + 2
LEFT JOIN sortedsamples ss4 ON ss1.avgdate = ss4.avgdate
    AND ss1.parameterID = ss4.parameterID AND ss1.rownum = ss4.rownum + 3
LEFT JOIN sortedsamples ss5 ON ss1.avgdate = ss5.avgdate
    AND ss1.parameterID = ss5.parameterID AND ss1.rownum = ss5.rownum + 4
GROUP BY ss1.parameterID, ss1.avgdate
答案 2 :(得分:1)
这是我的递归CTE解决方案,它是可参数化的:
-- Parameterizable daily average per parameter.
--   @minimal_valid_count    minimum number of valid samples a day must have
--   @critical_invalid_count longest run of consecutive invalid samples tolerated
-- rec_samples walks each day's samples in order and carries inv_seq_num, the
-- length of the run of invalid samples ending at the current row (0 on a valid row).
WITH
seq_samples AS (
    SELECT
        sampledate, parameterID, value, valid,
        avgdate = CAST(FLOOR(CAST(sampledate AS float)) AS datetime),
        rownum = ROW_NUMBER() OVER (
            PARTITION BY parameterID, CAST(FLOOR(CAST(sampledate AS float)) AS datetime)
            ORDER BY sampledate)
    FROM samples
),
rec_samples AS (
    -- Anchor: first sample of each day; the run length is 1 if it is invalid.
    SELECT
        sampledate, parameterID, value, valid, avgdate, rownum,
        inv_seq_num = 1 - valid
    FROM seq_samples
    WHERE rownum = 1
    UNION ALL
    -- Recursive step: next sample of the same day; a valid sample resets the
    -- run, an invalid one extends it.
    SELECT
        ss.sampledate, ss.parameterID, ss.value, ss.valid, ss.avgdate, ss.rownum,
        inv_seq_num = CASE ss.valid WHEN 1 THEN 0 ELSE rs.inv_seq_num + 1 END
    FROM seq_samples ss
    INNER JOIN rec_samples rs ON ss.avgdate = rs.avgdate
        AND ss.parameterID = rs.parameterID AND ss.rownum = rs.rownum + 1
)
SELECT
    avgdate,
    parameterID,
    -- Average the valid samples only: invalid rows yield NULL, which AVG()
    -- skips, so bad readings do not distort the daily mean.
    avgvalue = AVG(CASE WHEN valid = 1 THEN value END)
FROM rec_samples
GROUP BY avgdate, parameterID
HAVING SUM(CAST(valid AS int)) >= @minimal_valid_count
    AND MAX(inv_seq_num) <= @critical_invalid_count
你的想法基本上就是在这里实现的:使用一个附加编号,该编号只对无效行递增,并且在日期变化或出现有效值时重新开始。最后,对该编号列应用MAX,以确定最大值是否未超过@critical_invalid_count。而对于另一个参数,显然只需检查valid属性的总和即可。
就是这样。
对seq_samples CTE的编辑(适用于你对原始查询的改编版本):
-- Replacement seq_samples CTE: the day (avgdate) is computed once in a derived
-- table, then reused both as an output column and as the ROW_NUMBER partition key.
seq_samples AS (
    SELECT
        dated.sampledate,
        dated.parameterID,
        dated.value,
        dated.valid,
        dated.avgdate,
        ROW_NUMBER() OVER (
            PARTITION BY dated.parameterID, dated.avgdate
            ORDER BY dated.sampledate
        ) AS rownum
    FROM (
        SELECT
            sampledate,
            parameterID,
            value,
            valid,
            CAST(FLOOR(CAST(sampledate AS float)) AS datetime) AS avgdate
        FROM samples
    ) AS dated
),
SSMS显示,我的原始查询与修改后的查询在性能上存在显著的、甚至令人难以置信的差异。(这只是基于估计执行计划中的数字。)我不知道你对我原来的解决方案做了哪些改编,但我希望我所看到的改进不会因为这些改编而完全丧失。
答案 3 :(得分:0)
在您的数百万个样本中,每天少于5个无效值的百分比是多少?如果它是一个足够高的百分比,你就可以了,因为你可以轻松地将它们从光标处理中排除。
如果每天有5个或更多无效值的样本数仍然是数百万,那么您可能需要等待很长时间。
答案 4 :(得分:0)
你的解决方案非常有趣(我从中学到了很多东西),但我想知道它们是否可以改进。
例如,所有解决方案(可能除了使用递归cte的解决方案之外)在无效连续样本的数量(N)中不是参数化的。 我可以想象,未来不同的N可能会有不同的年份或参数。
我想知道是否可以从rownum()解决方案出发设计一个方案:如果我能找到一种方法,在valid列的值每次发生变化时重置计数,那么只要某一天中存在 rownum > N 且 valid = 0 的行,我就可以直接判定这一天无效;这将非常简单、快速且通用。
我试着更好地解释这个想法:
假设我可以使用rownum或类似函数来获取此信息:
date par value valid rownum
2010-01-26 00:00:00.000 25 14.0000000000 1 1
2010-01-26 01:00:00.000 25 15.3000001907 1 2
2010-01-26 02:00:00.000 25 16.8999996185 1 3
2010-01-26 03:00:00.000 25 13.6000003815 1 4
2010-01-26 04:00:00.000 25 16.2000007629 1 5
2010-01-26 05:00:00.000 25 12.1999998093 -1 1
2010-01-26 06:00:00.000 25 17.2000007629 -1 2
2010-01-26 07:00:00.000 25 16.2999992371 1 1
2010-01-26 08:00:00.000 25 18.2999992371 1 2
2010-01-26 09:00:00.000 25 15.0000000000 1 3
2010-01-26 10:00:00.000 25 17.7000007629 1 4
2010-01-26 11:00:00.000 25 16.5000000000 1 5
2010-01-26 12:00:00.000 25 17.3999996185 1 6
2010-01-26 13:00:00.000 25 17.7000007629 1 7
2010-01-26 14:00:00.000 25 18.2999992371 1 8
2010-01-26 15:00:00.000 25 15.1000003815 -1 1
2010-01-26 16:00:00.000 25 16.5000000000 -1 2
2010-01-26 17:00:00.000 25 10.3999996185 -1 3
2010-01-26 18:00:00.000 25 10.8999996185 -1 4
2010-01-26 19:00:00.000 25 10.1000003815 -1 5 <-----!!!!
2010-01-26 20:00:00.000 25 13.6999998093 1 1
2010-01-26 21:00:00.000 25 12.6999998093 1 2
2010-01-26 22:00:00.000 25 15.3999996185 -1 1
2010-01-26 23:00:00.000 25 8.6000003815 -1 2
如果N = 5,则存在行
2010-01-26 19:00:00.000 25 10.1000003815 -1 5
表示整天无效(更不用说无效数据的总数)
您如何看待这个想法?
(我不知道这应该是一个编辑还是一个明确的答案)
答案 5 :(得分:0)
以下是使用CROSS APPLY并避免使用ROW_NUMBER()的另一个答案。
但是,如果超过5个样本在同一时间发生,则对于相同的parameterID,这将无法正常工作。如果是这种情况,则需要再次使用ROW_NUMBER()。
-- Per-day totals for one parameter (@parameterID), without ROW_NUMBER().
-- For every sample, CROSS APPLY inspects that sample plus the next 4 of the same
-- day; "date_invalidated" is 1 when some window holds 5 samples that are all invalid.
SELECT
    "samples".parameterID AS "parameter_id",
    DATEADD(DAY, DATEDIFF(DAY, 0, "samples".sampledate), 0) AS "sample_date",
    SUM("samples".value) AS "total",
    SUM(CASE WHEN "samples".valid = 1 THEN "samples".value ELSE 0 END) AS "total_valid",
    COUNT(*) AS "count",
    -- valid is a bit column; SUM() does not accept bit, so cast to int first.
    SUM(CAST("samples".valid AS INT)) AS "count_valid",
    MAX("check".invalid) AS "date_invalidated"
FROM
    samples
CROSS APPLY
(
    SELECT
        -- Require a full window of 5: a shorter window at the end of the day
        -- must not invalidate the day just because its few samples are invalid.
        CASE
            WHEN COUNT(*) = 5 AND SUM(CAST("data".valid AS INT)) = 0 THEN 1
            ELSE 0
        END AS "invalid"
    FROM
    (
        SELECT TOP 5
            "5_samples".valid
        FROM
            samples AS "5_samples"
        WHERE
            "5_samples".parameterID = "samples".parameterID
            AND "5_samples".sampledate >= "samples".sampledate
            -- Half-open upper bound: strictly before the start of the next day.
            AND "5_samples".sampledate < DATEADD(DAY, DATEDIFF(DAY, 0, "samples".sampledate), 1)
        ORDER BY
            "5_samples".sampledate
    )
    AS "data"
)
AS "check"
WHERE
    "samples".parameterID = @parameterID
GROUP BY
    -- GROUP BY cannot reference the SELECT alias parameter_id; use the column.
    "samples".parameterID,
    DATEADD(DAY, DATEDIFF(DAY, 0, "samples".sampledate), 0)