是否有使此查询脚本更有效的方法?

时间:2020-04-23 15:24:11

标签: sql-server

我被赋予创建一个查询的任务,给定一个表,该查询将对每20行进行批处理聚合,并生成一个新表,该表具有给定表中5%的行。因此,如果给我一个100行表,那么新表将是5行,其中每一行都是行的集合:[1-20],[21-40],[41-60],[61-80] ,[81-100]。

此任务的目的是确定特定进程的“周期”,其中每个周期是一个单独的列“ Cycle_X”,其中“ X”是从1到12的数字。每个“ Cycle_X”列均填充有一个从0到4的数字表示优先级; 0表示不优先。每个“循环”都应该是顺序的,因此“ Cycle_1”先于“ Cycle_2”,而“ Cycle_2”先于“ Cycle_3”等。我的查询确定每个小型批生产的循环的方式是在批处理中选择“ Cycle_X”列最小零个数。为了希望能帮助我澄清上面已经解释的内容,以下是根据我的描述simplified example base tablesimplified example output table的图像,其中它们以5为一批进行汇总。

我已经在80k行以上的数据集中测试了查询,但是由于连接超时而无法完成,现在我的数据集超过一百万行。鉴于当前的情况,我正在远程工作,所以我认为远程连接不够稳定,无法完成该过程,但是由于我不确定这是确切的错误消息:

Msg 121,第20级,状态0,第0行
从服务器接收结果时发生传输级错误。 (提供者:TCP提供程序,错误:0-信号灯超时时间已到期。)

我首先进行的查询将为批次聚集创建一个基表,并为一个空表填充批次聚集。在声明并设置了一些变量之后,有一个while循环使用INSERT INTO语句一次填充最初为空的表一次(从基本表中的20行的最小批处理中填充一行)。使用大型CASE WHEN语句确定各个周期的最小零计数。完整的查询可以在下面找到,我希望在提高查询效率方面有所帮助。到目前为止,我所做的唯一更改是将表创建和while循环分离为单独的查询。

USE _dataBase

-- Create a temp table to perform batch aggregation on
CREATE TABLE dbo.baseTable
WITH (DISTRIBUTION = ROUND_ROBIN) AS
SELECT
    ROW_NUMBER() OVER (ORDER BY _time) AS idx,
    _time,
    CASE WHEN Cycle_1 = 0 THEN 1 ELSE 0 END AS z1, --logical index that detetcts zero and not zero
    CASE WHEN Cycle_2 = 0 THEN 1 ELSE 0 END AS z2,
    CASE WHEN Cycle_3 = 0 THEN 1 ELSE 0 END AS z3,
    CASE WHEN Cycle_4 = 0 THEN 1 ELSE 0 END AS z4,
    CASE WHEN Cycle_5 = 0 THEN 1 ELSE 0 END AS z5,
    CASE WHEN Cycle_6 = 0 THEN 1 ELSE 0 END AS z6,
    CASE WHEN Cycle_7 = 0 THEN 1 ELSE 0 END AS z7,
    CASE WHEN Cycle_8 = 0 THEN 1 ELSE 0 END AS z8,
    CASE WHEN Cycle_9 = 0 THEN 1 ELSE 0 END AS z9,
    CASE WHEN Cycle_10 = 0 THEN 1 ELSE 0 END AS z10
FROM dbo.srcTable
WHERE _date = 20200423 --this must be parameterised

-- Create a table to store the batch aggregation (The table name should be parameterised somehow)
CREATE TABLE dbo.aggTable
(
    startTime datetime NOT NULL,
    endTime datetime NOT NULL,
    sum1 int NOT NULL,
    sum2 int NOT NULL,
    sum3 int NOT NULL,
    sum4 int NOT NULL,
    sum5 int NOT NULL,
    sum6 int NOT NULL,
    sum7 int NOT NULL,
    sum8 int NOT NULL,
    sum9 int NOT NULL,
    sum10 int NOT NULL,
    cycle int NOT NULL
)
--Declare variables
DECLARE
    @maxIter int, --The amount of times the loop runs
    @numRows int, --Number of rows in the temp table
    @batch int, --The size of the batch aggregation
    @offSet int, --The stat row of a batch that is updated during the loop
    @limit int, --The end row of a batch that is updated during the loop
    @iter int --Loop counter
--Set variables
SET @numRows = (SELECT COUNT(idx) FROM dbo.baseTable)
SET @maxIter = @numRows / 20
SET @batch = 20
SET @offSet = 1
SET @limit = 20
SET @iter = 0

--The loop
WHILE @iter < @maxIter
BEGIN
    --Append to this table
    INSERT INTO dbo.aggTable
    (
        startTime,
        endTime,
        sum1,
        sum2,
        sum3,
        sum4,
        sum5,
        sum6,
        sum7,
        sum8,
        sum9,
        sum10,
        cycle
    )
    SELECT
        MIN(_time) AS startTime,
        MAX(_time) AS endTime,
        SUM(z1) AS sum1, --this counts the number of zeroes for Cycle_1
        SUM(z2) AS sum2,
        SUM(z3) AS sum3,
        SUM(z4) AS sum4,
        SUM(z5) AS sum5,
        SUM(z6) AS sum6,
        SUM(z7) AS sum7,
        SUM(z8) AS sum8,
        SUM(z9) AS sum9,
        SUM(z10) AS sum10,
        --this case when clause is used to determine the Cycle basd on the minimum count of zeroes
        CASE
            WHEN SUM(z1) <= SUM(z2) AND SUM(z1) <= SUM(z3) AND SUM(z1) <= SUM(z4) AND SUM(z1) <= SUM(z5) AND SUM(z1) <= SUM(z6) AND SUM(z1) <= SUM(z7) AND SUM(z1) <= SUM(z8) AND SUM(z1) <= SUM(z9) AND SUM(z1) <= SUM(z10) THEN 1
            WHEN SUM(z2) <= SUM(z1) AND SUM(z2) <= SUM(z3) AND SUM(z2) <= SUM(z4) AND SUM(z2) <= SUM(z5) AND SUM(z2) <= SUM(z6) AND SUM(z2) <= SUM(z7) AND SUM(z2) <= SUM(z8) AND SUM(z2) <= SUM(z9) AND SUM(z2) <= SUM(z10) THEN 2
            WHEN SUM(z3) <= SUM(z1) AND SUM(z3) <= SUM(z2) AND SUM(z3) <= SUM(z4) AND SUM(z3) <= SUM(z5) AND SUM(z3) <= SUM(z6) AND SUM(z3) <= SUM(z7) AND SUM(z3) <= SUM(z8) AND SUM(z3) <= SUM(z9) AND SUM(z3) <= SUM(z10) THEN 3
            WHEN SUM(z4) <= SUM(z1) AND SUM(z4) <= SUM(z2) AND SUM(z4) <= SUM(z3) AND SUM(z4) <= SUM(z5) AND SUM(z4) <= SUM(z6) AND SUM(z4) <= SUM(z7) AND SUM(z4) <= SUM(z8) AND SUM(z4) <= SUM(z9) AND SUM(z4) <= SUM(z10) THEN 4
            WHEN SUM(z5) <= SUM(z1) AND SUM(z5) <= SUM(z2) AND SUM(z5) <= SUM(z3) AND SUM(z5) <= SUM(z4) AND SUM(z5) <= SUM(z6) AND SUM(z5) <= SUM(z7) AND SUM(z5) <= SUM(z8) AND SUM(z5) <= SUM(z9) AND SUM(z5) <= SUM(z10) THEN 5
            WHEN SUM(z6) <= SUM(z1) AND SUM(z6) <= SUM(z2) AND SUM(z6) <= SUM(z3) AND SUM(z6) <= SUM(z4) AND SUM(z6) <= SUM(z5) AND SUM(z6) <= SUM(z7) AND SUM(z6) <= SUM(z8) AND SUM(z6) <= SUM(z9) AND SUM(z6) <= SUM(z10) THEN 6
            WHEN SUM(z7) <= SUM(z1) AND SUM(z7) <= SUM(z2) AND SUM(z7) <= SUM(z3) AND SUM(z7) <= SUM(z4) AND SUM(z7) <= SUM(z5) AND SUM(z7) <= SUM(z6) AND SUM(z7) <= SUM(z8) AND SUM(z7) <= SUM(z9) AND SUM(z7) <= SUM(z10) THEN 7
            WHEN SUM(z8) <= SUM(z1) AND SUM(z8) <= SUM(z2) AND SUM(z8) <= SUM(z3) AND SUM(z8) <= SUM(z4) AND SUM(z8) <= SUM(z5) AND SUM(z8) <= SUM(z6) AND SUM(z8) <= SUM(z7) AND SUM(z8) <= SUM(z9) AND SUM(z8) <= SUM(z10) THEN 8
            WHEN SUM(z9) <= SUM(z1) AND SUM(z9) <= SUM(z2) AND SUM(z9) <= SUM(z3) AND SUM(z9) <= SUM(z4) AND SUM(z9) <= SUM(z5) AND SUM(z9) <= SUM(z6) AND SUM(z9) <= SUM(z7) AND SUM(z9) <= SUM(z8) AND SUM(z9) <= SUM(z10) THEN 9
            ELSE 10 END AS cycle
    FROM dbo.baseTable
    WHERE rowNum >= @offSet --row number of start of batch
    AND rowNum <= @limit --row number of end of batch
    --Update loop variables
    SET @offSet = @offSet + @batch
    SET @limit = @limit + @batch
    SET @iter = @iter + 1
END

--drop the temp table
DROP TABLE dbo.baseTable
--view the aggregated data
SELECT *
FROM dbo.aggTable
ORDER BY startTime

请让我知道是否需要进一步澄清或该问题不适当。

1 个答案:

答案 0 :(得分:0)

没有样本数据很难做到这一点。但是,请看这对您有什么作用

未测试(如果不太明显)

declare @date date = '2020-04-23'
declare @srcTable table (_time datetime, cycle_1 int, cycle_2 int, cycle_3 int
                        ,cycle_4 int, cycle_5 int, cycle_6 int, cycle_7 int
                        ,cycle_8 int, cycle_9 int, cycle_10 int, _date date)



insert into @srcTable
select getdate(), 0, 0, 2, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 1, 1, 2, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 0, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 2, 1, 0, 1, 0, 9, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 0, 2, 0, 0, 9, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 4, 1, 0, 1, 0, 9, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 5, 1, 2, 1, 9, 9, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 2, 1, 2, 1, 0, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 1, 0, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 0, 2, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 5, 1, 2, 1, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 0, 0, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 2, 1, 2, 0, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 1, 1, 2, 0, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 0, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 3, 1, 2, 0, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 4, 0, 0, 1, 0, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 0, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 6, 1, 2, 1, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 5, 1, 2, 1, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 0, 2, 0, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 1, 0, 0, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 3, 2, 1, 0, 9, 2, 3, 4, 0, 2, '2020-04-23'
union all
select getdate(), 0, 0, 2, 1, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 0, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 2, 0, 2, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 0, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 0, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 4, 0, 0, 0, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 0, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 5, 0, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 0, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'



select *
    ,(select min(cycle) --what a convoluted way to get the minimum cycle number that equals the maximum number of zero values
        from (values (z1, 1), (z2, 2), (z3, 3), (z4, 4), (z5, 5), (z6, 6), (z7, 7), (z8, 8), (z9, 9), (z10, 10)) AS value(amt, cycle)
        inner join (select (select max(amt) FROM (VALUES (z1), (z2), (z3), (z4), (z5), (z6), (z7), (z8), (z9), (z10)) AS value(amt)) amt) maxamt on maxamt.amt = value.amt) cycle
from (
    select min(_time) startTime, max(_time) endTime -- get first level of aggregated information
        ,sum(CASE WHEN Cycle_1 = 0 THEN 1 ELSE 0 END) AS z1
        ,sum(CASE WHEN Cycle_2 = 0 THEN 1 ELSE 0 END) AS z2
        ,sum(CASE WHEN Cycle_3 = 0 THEN 1 ELSE 0 END) AS z3
        ,sum(CASE WHEN Cycle_4 = 0 THEN 1 ELSE 0 END) AS z4
        ,sum(CASE WHEN Cycle_5 = 0 THEN 1 ELSE 0 END) AS z5
        ,sum(CASE WHEN Cycle_6 = 0 THEN 1 ELSE 0 END) AS z6
        ,sum(CASE WHEN Cycle_7 = 0 THEN 1 ELSE 0 END) AS z7
        ,sum(CASE WHEN Cycle_8 = 0 THEN 1 ELSE 0 END) AS z8
        ,sum(CASE WHEN Cycle_9 = 0 THEN 1 ELSE 0 END) AS z9
        ,sum(CASE WHEN Cycle_10 = 0 THEN 1 ELSE 0 END) AS z10
    from (
        select *, ROW_NUMBER() OVER (ORDER BY _time) - 1 AS idx -- this gives us idx to aggregate on
        from @srcTable
        where _date = @date
    ) idx
    group by idx.idx / 20
) a