tricky query with patterns of consecutive rows

  • Hi!

    I'm having struggle with selecting rows, that match "dynamic" rules as described:

    • if there are 10 or more rows containing a value greater than 1000 this is my START signal
    • if there are 10 or more rows containing a value smaller than 1000 this is my STOP signal
    • i want everything between a START and a STOP block

    I think it has to do with "gaps and islands" and probably something with lead/lag to build group ids. But I don't know how to build the "counter" so I know if it's 10 or more records that match the pattern.

    Please see this simplified demo table (id is an INT, in real-world it's a timestamp of an event)

    drop table if exists patternDemo;

    create table patternDemo
    (
    id int,
    value int,
    include char(3) -- select record in final query?
    )


    -- ignore everything before START
    insert into patternDemo values (1, 100, 'no');
    insert into patternDemo values (2, 100, 'no');
    insert into patternDemo values (3, 100, 'no');
    insert into patternDemo values (4, 200, 'no');
    insert into patternDemo values (5, 100, 'no');
    insert into patternDemo values (6, 300, 'no');
    insert into patternDemo values (7, 500, 'no');
    -- START: block having >= 10 consecutive valus above 1000
    insert into patternDemo values (8, 1500, 'yes');
    insert into patternDemo values (9, 1500, 'yes');
    insert into patternDemo values (10, 1300, 'yes');
    insert into patternDemo values (11, 1600, 'yes');
    insert into patternDemo values (12, 1500, 'yes');
    insert into patternDemo values (13, 1700, 'yes');
    insert into patternDemo values (14, 1800, 'yes');
    insert into patternDemo values (15, 1800, 'yes');
    insert into patternDemo values (16, 1800, 'yes');
    insert into patternDemo values (17, 1800, 'yes');
    insert into patternDemo values (18, 1600, 'yes');
    insert into patternDemo values (19, 1500, 'yes');
    -- after a START block having at least 10 rows with a value over 1000
    -- there maybe lower values, which i want to include, since we're still
    -- before a STOP pattern
    insert into patternDemo values (20, 600, 'yes');
    insert into patternDemo values (21, 400, 'yes');
    insert into patternDemo values (22, 1500, 'yes');
    insert into patternDemo values (23, 1800, 'yes');
    insert into patternDemo values (24, 1800, 'yes');
    insert into patternDemo values (25, 1800, 'yes');
    -- STOP pattern: another block of at least 10 rows with a value lower than 1000
    insert into patternDemo values (26, 300, 'no');
    insert into patternDemo values (27, 300, 'no');
    insert into patternDemo values (28, 100, 'no');
    insert into patternDemo values (29, 100, 'no');
    insert into patternDemo values (30, 100, 'no');
    insert into patternDemo values (31, 100, 'no');
    insert into patternDemo values (32, 100, 'no');
    insert into patternDemo values (33, 100, 'no');
    insert into patternDemo values (34, 100, 'no');
    insert into patternDemo values (35, 100, 'no');
    insert into patternDemo values (36, 100, 'no');
    insert into patternDemo values (37, 100, 'no');
    insert into patternDemo values (38, 100, 'no');
    insert into patternDemo values (39, 100, 'no');
    insert into patternDemo values (40, 100, 'no');

     

  • Well this is definitely a head scratcher.  The following code is not pretty, but it seems to get the expected results

    WITH cteBase AS (
    SELECT *
    , rn = ROW_NUMBER() OVER ( ORDER BY pd.id )
    , IsStartBlock = SUM( CASE WHEN pd.VALUE > 1000 THEN 1 ELSE 0 END )
    OVER ( ORDER BY pd.id ROWS BETWEEN 9 PRECEDING AND CURRENT ROW )
    , IsEndBlock = SUM( CASE WHEN pd.VALUE < 1000 THEN 1 ELSE 0 END )
    OVER ( ORDER BY pd.id ROWS BETWEEN CURRENT ROW AND 9 FOLLOWING )
    FROM #patternDemo AS pd
    )
    , cteStart AS (
    SELECT sb.*
    , grp = sb.rn - ROW_NUMBER() OVER ( ORDER BY sb.rn )
    FROM cteBase AS sb
    WHERE sb.IsStartBlock IN ( 1, 10 )
    )
    , cteStartBlock AS (
    SELECT sb.StartBlockStart
    , StartBlockEnd = MAX( st.id )
    , rn = ROW_NUMBER() OVER (ORDER BY sb.StartBlockStart)
    FROM cteStart AS st
    OUTER APPLY ( SELECT StartBlockStart = MAX( sbs.id )
    FROM cteStart AS sbs
    WHERE sbs.IsStartBlock = 1
    AND sbs.id < st.id
    ) AS sb
    WHERE st.IsStartBlock = 10
    GROUP BY st.grp, sb.StartBlockStart
    )
    , cteEnd as (
    SELECT eb.*
    , grp = eb.rn - ROW_NUMBER() OVER ( ORDER BY eb.rn )
    FROM cteBase AS eb
    WHERE eb.IsEndBlock IN ( 1, 10 )
    )
    , cteEndBlock AS (
    SELECT EndBlockStart = MIN( nd.id )
    , eb.EndBlockEnd
    , rn = ROW_NUMBER() OVER (ORDER BY eb.EndBlockEnd)
    FROM cteEnd AS nd
    OUTER APPLY ( SELECT EndBlockEnd = MIN( ebs.id )
    FROM cteEnd AS ebs
    WHERE ebs.IsEndBlock = 1
    AND ebs.id > nd.id
    ) AS eb
    WHERE nd.IsEndBlock = 10
    GROUP BY nd.grp, eb.EndBlockEnd
    )
    , cteBlocks AS (
    SELECT BlockNo = scte.rn
    , scte.StartBlockStart
    , scte.StartBlockEnd
    , ecte.EndBlockStart
    , ecte.EndBlockEnd
    FROM cteStartBlock AS scte
    LEFT JOIN cteEndBlock AS ecte
    ON ecte.rn = scte.rn
    )
    SELECT bk.BlockNo
    , src.*
    FROM #patternDemo AS src
    INNER JOIN cteBlocks AS bk
    ON src.id > bk.StartBlockEnd
    AND src.id < bk.EndBlockStart
    ORDER BY src.id;
  • WITH Rules
    AS
    (

    SELECT id, [value]
    ,CASE
    WHEN 10 = SUM(CASE WHEN [value] >= 1000 THEN 1 ELSE 0 END)
    OVER (ORDER BY id ROWS BETWEEN CURRENT ROW AND 9 FOLLOWING)
    THEN 1
    ELSE 0
    END AS StartSignal
    ,CASE
    WHEN 10 = SUM(CASE WHEN [value] < 1000 THEN 1 ELSE 0 END)
    OVER (ORDER BY id ROWS BETWEEN CURRENT ROW AND 9 FOLLOWING)
    THEN 1
    ELSE 0
    END AS StopSignal
    FROM patternDemo
    )
    ,StartStop
    AS
    (
    SELECT id, [value]
    ,CASE
    WHEN 1 = LAG(StartSignal) OVER (ORDER BY id)
    THEN 0
    ELSE StartSignal
    END AS StartSignal
    ,CASE
    WHEN 1 = LAG(StopSignal) OVER (ORDER BY id)
    THEN 0
    ELSE StopSignal
    END AS StopSignal
    FROM Rules
    )
    ,StartWithStart
    AS
    (
    SELECT id, [value], StartSignal
    ,CASE
    WHEN LAG(StopSignal) OVER (ORDER BY id) IS NULL
    THEN 0
    ELSE StopSignal
    END AS StopSignal
    FROM StartStop
    )
    ,Includes
    AS
    (
    SELECT id, [value]
    ,SUM(StartSignal) OVER (ORDER BY id)
    - SUM(StopSignal) OVER (ORDER BY id) AS IncludeSignal
    FROM StartWithStart
    )
    SELECT id, [value]
    FROM Includes
    WHERE IncludeSignal = 1;
  • It seems safer to avoid LEAD and LAG because start/stop conditions might repeat within ranges of groups

    with 
    gap_cte as (
    select *, iif(v.sgn<>lag(v.sgn) over (order by id), 1, 0) gap
    from #patternDemo pd
    cross apply (values (sign([value]-1000))) v(sgn)),
    grp_cte as (
    select *, sum(gap) over (order by id) grp
    from gap_cte),
    rn_cte as (
    select *,
    row_number() over (partition by grp order by id desc) rn,
    row_number() over (partition by grp order by id) rn_max
    from grp_cte),
    start_stop_cte as (
    select top(1) with ties *
    from rn_cte
    where rn_max>=10
    order by rn)
    select g.*
    from start_stop_cte s
    outer apply (select top(1) grp
    from start_stop_cte ss
    where ss.id<s.id
    and ss.sgn=1
    order by id desc) starts(start_grp)
    left join grp_cte g on g.grp<s.grp
    and g.grp>=starts.start_grp
    where s.sgn=-1
    order by rn;

    Aus dem Paradies, das Cantor uns geschaffen, soll uns niemand vertreiben können

  • This should cope with more edge conditions than my original query:

    WITH Rules
    AS
    (
    SELECT id, [value]
    ,CASE
    WHEN 10 = SUM(CASE WHEN [value] >= 1000 THEN 1 ELSE 0 END)
    OVER (ORDER BY id ROWS BETWEEN CURRENT ROW AND 9 FOLLOWING)
    THEN 1
    ELSE 0
    END AS StartSignal
    ,CASE
    WHEN 10 = SUM(CASE WHEN [value] < 1000 THEN 1 ELSE 0 END)
    OVER (ORDER BY id ROWS BETWEEN CURRENT ROW AND 9 FOLLOWING)
    THEN 1
    ELSE 0
    END AS StopSignal
    FROM patternDemo
    )
    ,StartStop
    AS
    (
    SELECT id, [value]
    ,CASE
    WHEN 1 = LAG(StartSignal) OVER (ORDER BY id)
    THEN 0
    ELSE StartSignal
    END AS StartSignal
    ,CASE
    WHEN 1 = LAG(StopSignal) OVER (ORDER BY id)
    THEN 0
    ELSE StopSignal
    END AS StopSignal
    ,MAX(StartSignal) OVER (ORDER BY id)
    -MAX(StopSignal) OVER (ORDER BY id) AS StartStopDiff
    FROM Rules
    )
    ,StartWithStart
    AS
    (
    SELECT id, [value], StartSignal
    ,CASE
    WHEN StartStopDiff = -1
    THEN 0
    ELSE StopSignal
    END AS StopSignal
    FROM StartStop
    )
    ,Includes
    AS
    (
    SELECT id, [value]
    ,SUM(StartSignal) OVER (ORDER BY id)
    - SUM(StopSignal) OVER (ORDER BY id) AS IncludeSignal
    FROM StartWithStart
    )
    SELECT id, [value], IncludeSignal
    FROM Includes
    WHERE IncludeSignal >= 1;

    • This reply was modified 4 months, 1 week ago by  Ken McKelvey.
    • This reply was modified 4 months, 1 week ago by  Ken McKelvey.
  • On looking at this again this may be a safer approach:

    WITH Rules
    AS
    (
    SELECT id, [value]
    ,CASE
    WHEN 10 = SUM(CASE WHEN [value] >= 1000 THEN 1 ELSE 0 END)
    OVER (ORDER BY id ROWS BETWEEN CURRENT ROW AND 9 FOLLOWING)
    THEN 1
    ELSE 0
    END AS StartSignal
    ,CASE
    WHEN 10 = SUM(CASE WHEN [value] < 1000 THEN 1 ELSE 0 END)
    OVER (ORDER BY id ROWS BETWEEN CURRENT ROW AND 9 FOLLOWING)
    THEN 1
    ELSE 0
    END AS StopSignal
    FROM patternDemo
    )
    ,PKOnOffs
    AS
    (
    SELECT id, [value]
    ,CASE
    WHEN StartSignal = 1
    THEN RIGHT('000000000' + CAST(id AS varchar(20)), 10) + 'On'
    WHEN StopSignal = 1
    THEN RIGHT('000000000' + CAST(id AS varchar(20)), 10) + 'Off'
    END AS PKOnOff
    FROM Rules
    )
    ,OnOffs
    AS
    (
    SELECT id, [value]
    -- MAX ignores NULLs
    ,ISNULL(SUBSTRING(MAX(PKOnOff) OVER (ORDER BY id), 11, 3), 'Off') AS OnOff
    FROM PKOnOffs
    )
    SELECT id, [value]
    FROM OnOffs
    WHERE OnOff = 'On';

Viewing 6 posts - 1 through 5 (of 5 total)

You must be logged in to reply to this topic. Login to reply