在SQL Server中复制Oracle的RANGE窗口函数的最佳方法

时间:2012-03-08 18:53:32

标签: .net sql sql-server stored-procedures

我需要在SQL Server中执行此Oracle查询:

-- Sustained min/max per (case_id, channel_index) in Oracle.
-- Inner query, evaluated for every row over the 3 minutes up to and including that row:
--   su_max     = lowest  dms_value in the window (a value the signal stayed at or above)
--   su_min     = highest dms_value in the window (a value the signal stayed at or below)
--   first_time = earliest start_time of the partition
-- Outer query: keeps only rows at least 3 minutes after first_time (so each window is
-- complete) and reduces them to the highest su_max and the lowest su_min.
select case_id, channel_index,
     min(su_min) as sustained_min,
     max(su_max) as sustained_max
from (
    select case_id, channel_index, start_time,
        min(dms_value) over (partition by case_id, channel_index order by start_time 
             range numtodsinterval(3, 'minute') preceding) as su_max,
        max(dms_value) over (partition by case_id, channel_index order by start_time 
             range numtodsinterval(3, 'minute') preceding) as su_min, 
        min(start_time) over (partition by case_id, channel_index order by start_time)
             as first_time
    -- no ORDER BY here: the outer GROUP BY does not depend on row order
    from  data_table
    ) su_data  -- Oracle does not accept AS in front of a table alias
where  
    first_time + numtodsinterval(3, 'minute') <= start_time
group by
    case_id, channel_index

以下是我在基本T-SQL中尝试完成的工作,但是当案例有100万条+记录时,需要花费37分钟(之后我取消了查询):

-- Computes, for one case, the sustained extremes of every channel.
-- A window is evaluated for each sample: it ends at the sample's start_time and reaches
-- back @time_limit - 1 seconds. A window only counts when the value_duration of its rows
-- sums to at least @time_limit.
--   [max] = highest of all per-window minimums
--   [min] = lowest  of all per-window maximums
--
-- Parameters:
--   @case_id      case whose rows in continuous_data are processed
--   @time_limit   window length (the DATEADD below treats it as seconds)
--   @bypass_only  kept for interface compatibility; not referenced in the body
-- Result set: one row per channel_index (case_id, channel_index, max, min).
ALTER procedure [dbo].[GetSustainedValues]( 
  @case_id int,
  @time_limit int, 
  @bypass_only bit = NULL)
as 
begin

DECLARE @time DateTime, @channelindex int, @lastchannelindex int
DECLARE @tmin float, @tmax float, @min float, @max float

DECLARE @results TABLE(case_id int, channel_index int, [max] float null, [min] float null)

-- LOCAL: the cursor is released with the procedure scope, so a failed run cannot leave
-- a global cursor behind that would break the next call.
DECLARE CursorName CURSOR LOCAL FAST_FORWARD
    FOR SELECT start_time, channel_index from continuous_data where case_id = @case_id order by channel_index, start_time
OPEN CursorName
FETCH NEXT FROM CursorName INTO @time, @channelindex
SET @lastchannelindex = @channelindex
WHILE @@FETCH_STATUS = 0
BEGIN
    IF @lastchannelindex != @channelindex
    BEGIN
        -- we are starting on a new channel so insert the finished channel into the
        -- results table and reset the min/max
        INSERT INTO @results(case_id, channel_index, [max], [min]) VALUES(@case_id, @lastchannelindex, @max, @min)
        SET @max = null
        SET @min = null
        SET @lastchannelindex = @channelindex
    END

    -- Min/max of the window ending at @time. When HAVING rejects the window no row is
    -- returned, the variables keep their old values and @@ROWCOUNT is 0.
    Select @tmax = MAX(dms_value), @tmin = MIN(dms_value)
    from continuous_data
    where case_id = @case_id and channel_index = @channelindex and start_time between DATEADD(s, -(@time_limit-1), @time) and @time 
    HAVING SUM(value_duration) >= @time_limit
    IF @@ROWCOUNT > 0
    BEGIN
        -- sustained max = highest window minimum
        IF @max IS null OR @tmin > @max
        BEGIN
            set @max = @tmin
        END

        -- sustained min = lowest window maximum
        IF @min IS null OR @tmax < @min
        BEGIN
            set @min = @tmax
        END
    END
    FETCH NEXT FROM CursorName INTO @time, @channelindex
END
CLOSE CursorName
DEALLOCATE CursorName

-- Flush the final channel: inside the loop a channel is only written when the NEXT
-- channel starts, so without this the last channel would be missing from the results.
-- @lastchannelindex stays NULL when the case has no rows at all.
IF @lastchannelindex IS NOT NULL
BEGIN
    INSERT INTO @results(case_id, channel_index, [max], [min]) VALUES(@case_id, @lastchannelindex, @max, @min)
END

SELECT case_id, channel_index, [max], [min] FROM @results
end

这是使用CLR存储过程的好地方吗?还有其他任何想法可以使这个查询更有效吗?

编辑3-9-2012: 不要过于关注“first_time”字段。它只是用来确保3分钟窗口在数据集开始3分钟之后才开始计算。在我的查询中,我并不关心first_time。我需要的是每个通道在所有3分钟周期内的最小/最大持续值。

以下是一些包含2个通道的示例数据。请注意,每个样本的持续时间并不总是相同:

-- Sample-data table for the question: one row per reading of a channel.
CREATE TABLE #continuous_data
(
        case_id         int
    ,   channel_index   int
    ,   start_time      datetime
    ,   dms_value       float
    ,   value_duration  smallint   -- presumably seconds covered by the reading; confirm
)

-- Sample rows for two channels (50 and 51) of case 2081.
-- Explicit column lists keep the inserts valid if the table's column order ever changes.
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 51, '2011-05-18 09:36:34.000', 90,       6)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:39.000', 94.8125,  1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:40.000', 95.4375,  1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 51, '2011-05-18 09:36:40.000', 96,       6)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:41.000', 96.75,    1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:42.000', 98.0625,  2)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:44.000', 99.3125,  1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:45.000', 100.625,  1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:46.000', 101.9375, 2)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 51, '2011-05-18 09:36:46.000', 98,       6)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:48.000', 103.25,   1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:49.000', 104.5625, 1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:50.000', 105.8125, 2)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:52.000', 107.125,  1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 51, '2011-05-18 09:36:52.000', 92,       6)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:53.000', 108.4375, 1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:54.000', 109.75,   1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:55.000', 111.0625, 2)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:57.000', 112.3125, 1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:58.000', 113.625,  1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 51, '2011-05-18 09:36:58.000', 86,       6)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:36:59.000', 114.9375, 2)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:37:01.000', 116.25,   1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:37:02.000', 117.5,    1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:37:03.000', 118.8125, 2)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 51, '2011-05-18 09:37:04.000', 80,       6)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:37:05.000', 120.125,  1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:37:06.000', 121.4375, 1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:37:07.000', 122.75,   1)
INSERT INTO #continuous_data (case_id, channel_index, start_time, dms_value, value_duration) VALUES (2081, 50, '2011-05-18 09:37:08.000', 124.0625, 1)

3 个答案:

答案 0 :(得分:0)

如果您要执行以下操作会怎么样?

-- For every row of data_table (the window end) compute the MIN/MAX dms_value over the
-- 3 minutes up to and including that row, then keep only the windows that end at least
-- 3 minutes after the first sample of their (case_id, channel_index).
--   su_qry : one row per window end, with the window's su_min / su_max
--   dtf    : first start_time per (case_id, channel_index)
--   dt2    : the main table row that the window ends on
SELECT dt2.case_id, dt2.channel_index, dtf.first_time, su_qry.su_min, su_qry.su_max
  FROM (SELECT   dtr.case_id, dtr.channel_index, dtr.start_time, MIN (dt.dms_value) AS su_min, MAX (dt.dms_value) AS su_max
            FROM data_table dt
                 INNER JOIN
                 -- T-SQL signature is DATEADD(datepart, number, date); datepart is not quoted
                 (SELECT case_id, channel_index, start_time, DATEADD (mi, -3, start_time) AS start_time_minus_3
                    FROM data_table) dtr
                 ON (    dt.case_id = dtr.case_id
                     AND dt.channel_index = dtr.channel_index
                     AND dt.start_time >= dtr.start_time_minus_3
                     AND dt.start_time <= dtr.start_time
                    )
        -- group by the window end (dtr), not by the data row (dt), so that the aggregates
        -- cover every row inside the window
        GROUP BY dtr.case_id, dtr.channel_index, dtr.start_time) su_qry
       INNER JOIN
       (SELECT   case_id, channel_index, MIN (start_time) AS first_time
            FROM data_table dt
        GROUP BY case_id, channel_index) dtf ON (su_qry.case_id = dtf.case_id AND su_qry.channel_index = dtf.channel_index)
       -- start_time is part of the join so each window matches only its own row instead of
       -- every row of the channel
       INNER JOIN data_table dt2 ON (su_qry.case_id = dt2.case_id AND su_qry.channel_index = dt2.channel_index AND su_qry.start_time = dt2.start_time)
 WHERE DATEADD (mi, 3, dtf.first_time) <= dt2.start_time

这不一定100%正确,但我认为它可能会给你想要的结果。基本上,我们通过大于等于和小于等于条件的自联接,找到数据表中每一行在过去3分钟内的最小值和最大值。然后把这些结果与“first_time”的计算结果联接,最后联接回主表并应用WHERE谓词。

答案 1 :(得分:0)

如果我理解正确,您需要以下

对于每个case_id,channel_index组合:

  1. 找到所有3分钟窗口中最低的MAX值(最小持续值)
  2. 找到所有3分钟窗口中最高的MIN值(最大持续值)。
  3. 使用当前行之前3分钟内的数据。如果距第一个(MIN)start_time值尚未满3分钟,则排除该数据。
  4. Oracle查询和您的解决方案(存储过程和CLR存储过程)之间仍存在一些无法解释的差异:

    1. Oracle查询并不保证每个窗口的时间跨度恰好是3分钟,它只是取当前行之前3分钟内的最小/最大值。WHERE子句 first_time + numtodsinterval(3, 'minute') <= start_time 会排除结束于最初3分钟之内的时间窗口。
    2. value_duration列位于示例数据中,但未在解决方案中使用
    3. 示例数据不包含3分钟的数据,因此我将时间范围更改为10秒
    4. 您没有列出样本数据的预期结果
    5. **解决方案** - 这可能不是最快的解决方案,但应该可行 -

      步骤0 :窗口时间范围 - 样本数据不包含3分钟的数据,因此我使用变量来保存窗口时间范围所需的秒数。对于实际数据,您可以使用180秒。

      -- Window length in seconds: 10 for the sample data; use 180 for real 3-minute windows
      DECLARE @seconds int
      SET @seconds = 10
      

      第1步:第一次 - 虽然first_time并不重要,但仍有必要确保我们不包含不完整的时间段。稍后将使用它在第一个完整时间段过去之前排除数据。

      -- Per (case_id, channel_index): the earliest and latest sample times, plus
      -- range_time = first_time + @seconds, i.e. the end of the first complete time period
      SELECT
          cd.case_id,
          cd.channel_index,
          MIN(cd.start_time)                            AS first_time,
          DATEADD(second, @seconds, MIN(cd.start_time)) AS range_time,
          MAX(cd.start_time)                            AS last_time
      FROM #continuous_data AS cd
      GROUP BY
          cd.case_id,
          cd.channel_index
      ORDER BY
          cd.case_id,
          cd.channel_index
      
      -- Results from the sample data
      case_id     channel_index first_time              range_time              last_time
      ----------- ------------- ----------------------- ----------------------- -----------------------
      2081        50            2011-05-18 09:36:39.000 2011-05-18 09:36:49.000 2011-05-18 09:37:08.000
      2081        51            2011-05-18 09:36:34.000 2011-05-18 09:36:44.000 2011-05-18 09:37:04.000
      

      第2步:时间窗 - Oracle查询使用 partition by case_id, channel_index order by start_time range numtodsinterval(3, 'minute') preceding 在子查询中查找最小和最大的dms_value以及first_time。由于SQL Server不支持这种按时间间隔的RANGE窗口,因此您需要使用子查询来定义3分钟的窗口。Oracle查询使用的是 range ... preceding,因此在SQL Server中要用带负值的DATEADD来计算窗口的起点。

      -- One window per row: it ends at the row's start_time and begins
      -- @seconds earlier
      SELECT
          cd.case_id,
          cd.channel_index,
          DATEADD(second, -@seconds, cd.start_time) AS window_start,
          cd.start_time                             AS window_end
      FROM #continuous_data AS cd
      ORDER BY
          cd.case_id,
          cd.channel_index,
          cd.start_time
      

      步骤3 :时间窗口的MIN / MAX - 接下来,您需要找到每个窗口的最小值和最大值。大部分计算都在这一步完成,也最需要调试才能得到预期的结果。

      -- Highest and lowest dms_value inside every window.
      -- time_min / time_max / time_diff expose the span actually covered by the
      -- rows of each window and are only there for debugging.
      SELECT
          su.case_id,
          su.channel_index,
          win.window_end,
          MAX(su.dms_value)  AS dms_max,
          MIN(su.dms_value)  AS dms_min,
          MIN(su.start_time) AS time_min,
          MAX(su.start_time) AS time_max,
          DATEDIFF(second, MIN(su.start_time), MAX(su.start_time)) AS time_diff
      FROM #continuous_data AS su
      INNER JOIN (
              -- one window per row, reaching @seconds back from its start_time
              SELECT
                  w.case_id,
                  w.channel_index,
                  DATEADD(second, -@seconds, w.start_time) AS window_start,
                  w.start_time                             AS window_end
              FROM #continuous_data AS w
          ) AS win
          ON  win.case_id       = su.case_id
          AND win.channel_index = su.channel_index
          AND su.start_time    >= win.window_start
          AND su.start_time    <= win.window_end
      INNER JOIN (
              -- first sample per channel and the end of its first complete period
              SELECT
                  f.case_id,
                  f.channel_index,
                  MIN(f.start_time)                            AS first_time,
                  DATEADD(second, @seconds, MIN(f.start_time)) AS range_time
              FROM #continuous_data AS f
              GROUP BY
                  f.case_id,
                  f.channel_index
          ) AS fir
          ON  fir.case_id       = su.case_id
          AND fir.channel_index = su.channel_index
      -- drop windows that end before the first complete period has elapsed
      WHERE win.window_end >= fir.range_time
      GROUP BY
          su.case_id,
          su.channel_index,
          win.window_end
      ORDER BY
          su.case_id,
          su.channel_index,
          win.window_end
      
      -- Results from sample data:
      case_id     channel_index window_end              dms_max                dms_min                time_min                time_max                time_diff
      ----------- ------------- ----------------------- ---------------------- ---------------------- ----------------------- ----------------------- -----------
      2081        50            2011-05-18 09:36:49.000 104.5625               94.8125                2011-05-18 09:36:39.000 2011-05-18 09:36:49.000 10
      2081        50            2011-05-18 09:36:50.000 105.8125               95.4375                2011-05-18 09:36:40.000 2011-05-18 09:36:50.000 10
      2081        50            2011-05-18 09:36:52.000 107.125                98.0625                2011-05-18 09:36:42.000 2011-05-18 09:36:52.000 10
      2081        50            2011-05-18 09:36:53.000 108.4375               99.3125                2011-05-18 09:36:44.000 2011-05-18 09:36:53.000 9
      2081        50            2011-05-18 09:36:54.000 109.75                 99.3125                2011-05-18 09:36:44.000 2011-05-18 09:36:54.000 10
      2081        50            2011-05-18 09:36:55.000 111.0625               100.625                2011-05-18 09:36:45.000 2011-05-18 09:36:55.000 10
      2081        50            2011-05-18 09:36:57.000 112.3125               103.25                 2011-05-18 09:36:48.000 2011-05-18 09:36:57.000 9
      2081        50            2011-05-18 09:36:58.000 113.625                103.25                 2011-05-18 09:36:48.000 2011-05-18 09:36:58.000 10
      2081        50            2011-05-18 09:36:59.000 114.9375               104.5625               2011-05-18 09:36:49.000 2011-05-18 09:36:59.000 10
      2081        50            2011-05-18 09:37:01.000 116.25                 107.125                2011-05-18 09:36:52.000 2011-05-18 09:37:01.000 9
      2081        50            2011-05-18 09:37:02.000 117.5                  107.125                2011-05-18 09:36:52.000 2011-05-18 09:37:02.000 10
      2081        50            2011-05-18 09:37:03.000 118.8125               108.4375               2011-05-18 09:36:53.000 2011-05-18 09:37:03.000 10
      2081        50            2011-05-18 09:37:05.000 120.125                111.0625               2011-05-18 09:36:55.000 2011-05-18 09:37:05.000 10
      2081        50            2011-05-18 09:37:06.000 121.4375               112.3125               2011-05-18 09:36:57.000 2011-05-18 09:37:06.000 9
      2081        50            2011-05-18 09:37:07.000 122.75                 112.3125               2011-05-18 09:36:57.000 2011-05-18 09:37:07.000 10
      2081        50            2011-05-18 09:37:08.000 124.0625               113.625                2011-05-18 09:36:58.000 2011-05-18 09:37:08.000 10
      2081        51            2011-05-18 09:36:46.000 98                     96                     2011-05-18 09:36:40.000 2011-05-18 09:36:46.000 6
      2081        51            2011-05-18 09:36:52.000 98                     92                     2011-05-18 09:36:46.000 2011-05-18 09:36:52.000 6
      2081        51            2011-05-18 09:36:58.000 92                     86                     2011-05-18 09:36:52.000 2011-05-18 09:36:58.000 6
      2081        51            2011-05-18 09:37:04.000 86                     80                     2011-05-18 09:36:58.000 2011-05-18 09:37:04.000 6
      

      第4步:最后,您可以将它们放在一起,以返回每个时间窗口的最低MAX值和最高MIN值:

      -- Sustained values per (case_id, channel_index):
      --   su_min = lowest  of the per-window maximums
      --   su_max = highest of the per-window minimums
      SELECT
          per_window.case_id,
          per_window.channel_index,
          MIN(per_window.dms_max) AS su_min,
          MAX(per_window.dms_min) AS su_max
      FROM (
              -- min/max of dms_value for every complete window
              SELECT
                  su.case_id,
                  su.channel_index,
                  win.window_end,
                  MAX(su.dms_value) AS dms_max,
                  MIN(su.dms_value) AS dms_min
              FROM #continuous_data AS su
              INNER JOIN (
                      -- one window per row, reaching @seconds back from its start_time
                      SELECT
                          w.case_id,
                          w.channel_index,
                          DATEADD(second, -@seconds, w.start_time) AS window_start,
                          w.start_time                             AS window_end
                      FROM #continuous_data AS w
                  ) AS win
                  ON  win.case_id       = su.case_id
                  AND win.channel_index = su.channel_index
                  AND su.start_time    >= win.window_start
                  AND su.start_time    <= win.window_end
              INNER JOIN (
                      -- first sample per channel and the end of its first complete period
                      SELECT
                          f.case_id,
                          f.channel_index,
                          MIN(f.start_time)                            AS first_time,
                          DATEADD(second, @seconds, MIN(f.start_time)) AS range_time
                      FROM #continuous_data AS f
                      GROUP BY
                          f.case_id,
                          f.channel_index
                  ) AS fir
                  ON  fir.case_id       = su.case_id
                  AND fir.channel_index = su.channel_index
              -- drop windows that end before the first complete period has elapsed
              WHERE win.window_end >= fir.range_time
              GROUP BY
                  su.case_id,
                  su.channel_index,
                  win.window_end
          ) AS per_window
      GROUP BY
          per_window.case_id,
          per_window.channel_index
      ORDER BY
          per_window.case_id,
          per_window.channel_index
      
      -- Results from sample data:
      case_id     channel_index su_min                 su_max
      ----------- ------------- ---------------------- ----------------------
      2081        50            104.5625               113.625
      2081        51            86                     96
      

答案 2 :(得分:0)

好的,所以这里有一个解决问题的CLR存储过程。这将在大约3:05(分钟)内从包含110万条记录的案例中返回持续的最小/最大值。请告诉我是否有一种简单的T-SQL方法来实现这一点,因为我宁愿不走这条路。但是,关于如何提高速度的评论也将受到赞赏。

/// <summary>
/// SQL CLR stored procedures that compute sustained (rolling-window) channel values.
/// </summary>
public partial class StoredProcedures
{
    /// <summary>
    /// Streams the rows of one case ordered by channel and start time, keeps a rolling
    /// window of rows per channel and reports, per channel, the lowest window maximum
    /// ("Min") and the highest window minimum ("Max") through <c>SqlContext.Pipe</c>.
    /// </summary>
    /// <param name="caseId">Value matched against <c>continuous_data.case_id</c>.</param>
    /// <param name="seconds">
    /// Minimum span, in seconds, between the oldest and the newest row of the window
    /// before the window is evaluated.
    /// </param>
    [Microsoft.SqlServer.Server.SqlProcedure]
    public static void ComputeCaseSustainedChannelValues(int caseId, int seconds)
    {
        // Parameterized instead of String.Format so the statement text stays constant.
        const string query =
            "Select channel_index, start_time, dms_value, value_duration from continuous_data " +
            "where case_id = @case_id and dms_type = 0 and error_code is NULL " +
            "order by channel_index, start_time";

        // using blocks dispose connection, command and reader on every path; the original
        // created placeholder SqlConnection/SqlCommand objects that were never disposed.
        using (SqlConnection con = new SqlConnection("context connection=true"))
        using (SqlCommand cmd = new SqlCommand(query, con))
        {
            cmd.Parameters.AddWithValue("@case_id", caseId);
            con.Open();

            Queue<ContinuousData> window = new Queue<ContinuousData>();
            float? sus_min = null, sus_max = null;  // sustained values of the current channel
            float? min = null, max = null;          // min/max of the rows currently in the window
            int currentChannel = -1;
            bool recalc = true;                     // true => min/max must be rebuilt from the window
            int recalccounter = 0;
            int rowcounter = 0;

            using (SqlDataReader reader = cmd.ExecuteReader())
            {
                while (reader.Read())
                {
                    var cd = new ContinuousData
                        {
                            // Convert.ToInt32 accepts smallint as well as int columns;
                            // GetInt16 throws when the column is declared int.
                            ChannelIndex = Convert.ToInt32(reader.GetValue(0)),
                            StartTime = reader.GetDateTime(1),
                            DmsValue = (float)reader.GetSqlDouble(2),
                            Duration = Convert.ToInt32(reader.GetValue(3))
                        };

                    // check to make sure we are on the same channel. If not
                    // report the finished channel, clear the queue and start over
                    if (currentChannel != cd.ChannelIndex)
                    {
                        if (currentChannel != -1)
                        {
                            SqlContext.Pipe.Send(String.Format("Channel: {0}  Min: {1}  Max: {2}", currentChannel, sus_min, sus_max));
                        }
                        currentChannel = cd.ChannelIndex;
                        window.Clear();
                        sus_max = null;
                        sus_min = null;
                        recalc = true;
                    }
                    rowcounter++;
                    window.Enqueue(cd);

                    // Fold EVERY incoming value into the rolling min/max, not only the ones
                    // that arrive while the window is full. Otherwise a row that arrives
                    // while the window is still too short is never reflected in min/max
                    // unless a recalculation happens to be pending.
                    if (!recalc)
                    {
                        if (max == null || cd.DmsValue > max)
                            max = cd.DmsValue;
                        if (min == null || cd.DmsValue < min)
                            min = cd.DmsValue;
                    }

                    if (cd.StartTime.Subtract(window.Peek().StartTime).TotalSeconds >= seconds)
                    {
                        if (recalc)
                        {
                            recalccounter++;
                            // a current min/max value was removed (or the channel changed),
                            // so rebuild the window's min/max from scratch
                            MinMax(window.ToArray(), out min, out max);
                            recalc = false;
                        }

                        // update the sustained min max based on the current window's min max
                        sus_min = sus_min == null || max < sus_min ? max : sus_min;
                        sus_max = sus_max == null || min > sus_max ? min : sus_max;

                        // now that we calculated remove the first item
                        var firstitem = window.Dequeue();
                        if (firstitem.DmsValue == sus_min || firstitem.DmsValue == sus_max ||
                            firstitem.DmsValue == min || firstitem.DmsValue == max)
                        {
                            recalc = true;
                        }
                    }
                }
            }

            // report the last channel, which has no following channel to trigger the output
            if (sus_max != null && sus_min != null)
            {
                SqlContext.Pipe.Send(String.Format("Channel: {0}  Min: {1}  Max: {2}", currentChannel, sus_min, sus_max));
            }

            SqlContext.Pipe.Send(String.Format("Rows: {0}, Recalcs performed: {1}", rowcounter, recalccounter));
            SqlContext.Pipe.Send("Done!");
        }
    }

    /// <summary>
    /// Scans <paramref name="cd"/> once and returns its lowest and highest DmsValue.
    /// </summary>
    /// <param name="cd">Rows to scan; must contain at least one element.</param>
    /// <param name="min">Lowest DmsValue found.</param>
    /// <param name="max">Highest DmsValue found.</param>
    private static void MinMax(ContinuousData[] cd, out float? min, out float? max)
    {
        min = cd[0].DmsValue;
        max = cd[0].DmsValue;

        // element 0 already seeded min/max
        for (int i = 1; i < cd.Length; i++)
        {
            if (min > cd[i].DmsValue)
                min = cd[i].DmsValue;
            if (max < cd[i].DmsValue)
                max = cd[i].DmsValue;
        }
    }

    /// <summary>One row read from continuous_data.</summary>
    public class ContinuousData
    {
        public int ChannelIndex { get; set; }
        public DateTime StartTime { get; set; }
        public float DmsValue { get; set; }
        public int Duration { get; set; }
    }

    /// <summary>Sustained min/max of one channel (not used by the procedure above).</summary>
    public class ChannelValues
    {
        public int ChannelIndex { get; set; }
        public float SustainedMin { get; set; }
        public float SustainedMax { get; set; }
    }
};