January 1, 2017 at 12:11 am
Have a problem that has me stumped. MSSQL 2014. The problem is this. Have a table storing id, date, occurrence(bit field).
id can be repeated. The date for the same id could be repeated. Some rows are an occurrence and some are not. Each id could have a variety of different dates, and what I need to know is for each id if at any point in any rolling 12 month period unique to that id from the date of the last occurrence if there are 4 or more occurrences. It would be possible for each id to pass that occurrence count based on the dates multiple times within a period of time using previous occurrences if they fell within a rolling 12 month period.
id date occurrence
1 1/1/15 1
1 1/2/15 0
2 1/31/15 1
3 2/1/15 1
4 2/14/15 1
5 2/26/15 1
2 4/1/15 1
5 4/5/15 1
5 4/6/15 0
1 4/9/15 1
1 5/16/15 1
1 6/16/15 1
1 10/5/15 1
1 11/6/15 1
1 1/16/16 1
1 2/24/16 1
So in dataset above id "1" had 4 or more occurrences 1/1/15,4/9/15,5/16/15,6/16/15 and in fact had 6 total within 12 months including 10/5/15 and 11/6/15. So I need to always figure from the newest date that is an occurrence in the table and if in the 12 months prior to that did the id reach the occurrence trigger.
Any help on the T-SQL would be appreciated.
Thanks
January 1, 2017 at 3:35 am
Hi and welcome to the forum. Here are two possible solutions, the first uses conventional aggregation and the second one uses the window function aggregation, the former seems to perform better.
😎
First a test data set
USE TEEST;
GO
SET NOCOUNT ON;
--/* -- UNCOMMENT THIS LINE TO SKIP THE SAMPLE DATA SET CREATION
---- START SAMPLE DATA SET
IF OBJECT_ID(N'dbo.TBL_TEST_OCCURRENCE') IS NOT NULL DROP TABLE dbo.TBL_TEST_OCCURRENCE;
CREATE TABLE dbo.TBL_TEST_OCCURRENCE
(
TOC_RID INT NOT NULL CONSTRAINT PK_dbo_TBL_TEST_OCCURRENCE_TOC_RID PRIMARY KEY CLUSTERED
,TOC_ID INT NOT NULL
,TOC_DATE DATE NOT NULL
,TOC_OCCURR BIT NOT NULL
);
DECLARE @SAMPLE_SIZE INT = 1000000;
DECLARE @ID_COUNT INT = 250000;
DECLARE @OCC_RATIO INT = 3;
DECLARE @FIRST_DATE DATE = CONVERT(DATE,'20150101',112);
DECLARE @DATE_RANGE INT = 720;
;WITH T(N) AS (SELECT X.N FROM (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0))X(N))
, NUMS(N) AS (SELECT TOP(@SAMPLE_SIZE) ROW_NUMBER() OVER (ORDER BY @@VERSION) AS N
FROM T T1,T T2,T T3,T T4,T T5,T T6,T T7,T T8,T T9)
INSERT INTO dbo.TBL_TEST_OCCURRENCE WITH (TABLOCK) (TOC_RID,TOC_ID,TOC_DATE,TOC_OCCURR)
SELECT
NM.N
,ABS(CHECKSUM(NEWID())) % @ID_COUNT
,DATEADD(DAY,(CHECKSUM(NEWID())) % @DATE_RANGE,@FIRST_DATE)
,SIGN(ABS(CHECKSUM(NEWID())) % @OCC_RATIO)
FROM NUMS NM;
--POC INDEX
CREATE NONCLUSTERED INDEX NCLIDC_DBO_TBL_TEST_OCCURRENCE_TOC_ID_TOC_DATE_INCL_TOC_OCCURR ON dbo.TBL_TEST_OCCURRENCE(TOC_ID,TOC_DATE) INCLUDE (TOC_OCCURR) --WHERE (TOC_OCCURR = 1);
---- END SAMPLE DATA SET
-- */
The solutions
-- CONTROL VARIABLES
DECLARE @MIN_OCCURR INT = 4;
DECLARE @YEAR_COUNT INT = 1;
-- SOLUTION CODE W. GROUP BY
;WITH LATEST_DATE_COUNT AS
(
SELECT
TOC.TOC_ID AS TOC_ID
,DATEADD(YEAR,-@YEAR_COUNT,MAX(TOC.TOC_DATE)) AS FIRST_DATE
,MAX(TOC.TOC_DATE) AS LATEST_DATE
FROM dbo.TBL_TEST_OCCURRENCE TOC
GROUP BY TOC.TOC_ID
)
SELECT
TC.TOC_ID
,SUM(CONVERT(INT,TC.TOC_OCCURR,0)) AS OCC_COUNT
,LDC.FIRST_DATE
,LDC.LATEST_DATE
FROM dbo.TBL_TEST_OCCURRENCE TC
CROSS APPLY LATEST_DATE_COUNT LDC
WHERE TC.TOC_ID = LDC.TOC_ID
AND TC.TOC_DATE >= LDC.FIRST_DATE
AND TC.TOC_DATE <= LDC.LATEST_DATE
AND TC.TOC_OCCURR = 1
GROUP BY
TC.TOC_ID
,LDC.FIRST_DATE
,LDC.LATEST_DATE
HAVING SUM(CONVERT(INT,TC.TOC_OCCURR,0)) >= @MIN_OCCURR
ORDER BY TC.TOC_ID
OPTION (MAXDOP 1)
;
--WINDOW FUNCTION SOLUTION
;WITH OCCURR_FLAGGED AS
(
SELECT
TOC.TOC_ID
,TOC.TOC_DATE
,MAX(TOC.TOC_DATE) OVER
(
PARTITION BY TOC.TOC_ID
) AS LATEST_DATE
,CASE
WHEN TOC.TOC_DATE >= DATEADD(YEAR,-@YEAR_COUNT,MAX(TOC.TOC_DATE) OVER (PARTITION BY TOC.TOC_ID)) AND TOC.TOC_OCCURR = 1 THEN 1
ELSE 0
END AS VALID_OCCURR
FROM dbo.TBL_TEST_OCCURRENCE TOC
)
,COUNTING_SET AS
(
SELECT
OCF.TOC_ID
,ROW_NUMBER() OVER
(
PARTITION BY OCF.TOC_ID
ORDER BY @@VERSION
) AS OCF_RID
,SUM(OCF.VALID_OCCURR) OVER (PARTITION BY OCF.TOC_ID) AS OCC_COUNT
,DATEADD(YEAR,-@YEAR_COUNT,OCF.LATEST_DATE) AS FIRST_DATE
,OCF.LATEST_DATE
FROM OCCURR_FLAGGED OCF
)
SELECT
CS.TOC_ID
,CS.OCC_COUNT
,CS.FIRST_DATE
,CS.LATEST_DATE
FROM COUNTING_SET CS
WHERE CS.OCF_RID = 1
AND CS.OCC_COUNT >= @MIN_OCCURR
OPTION (MAXDOP 1)
;
January 1, 2017 at 11:25 am
Here is another solution. I didn't benchmark this solution against the million row table, but I expect it to perform more efficiently based on simplicity even if the POC index isn't in place:
--truncate table TBL_TEST_OCCURRENCE
IF OBJECT_ID(N'dbo.TBL_TEST_OCCURRENCE') IS NOT NULL DROP TABLE dbo.TBL_TEST_OCCURRENCE;
CREATE TABLE dbo.TBL_TEST_OCCURRENCE
(
TOC_ID INT NOT NULL
,TOC_DATE DATE NOT NULL
,TOC_OCCURR BIT NOT NULL
);
insert TBL_TEST_OCCURRENCE
values (1, '1/1/15', 1)
insert TBL_TEST_OCCURRENCE
values (1, '1/2/15', 0)
insert TBL_TEST_OCCURRENCE
values (2, '1/31/15', 1)
insert TBL_TEST_OCCURRENCE
values (3, '2/1/15', 1 )
insert TBL_TEST_OCCURRENCE
values (4, '2/14/15', 1)
insert TBL_TEST_OCCURRENCE
values (5, '2/26/15', 1)
insert TBL_TEST_OCCURRENCE
values (2, '4/1/15', 1)
insert TBL_TEST_OCCURRENCE
values (5, '4/5/15', 1)
insert TBL_TEST_OCCURRENCE
values (5, '4/6/15', 0)
insert TBL_TEST_OCCURRENCE
values (1, '4/9/15', 1)
insert TBL_TEST_OCCURRENCE
values (1, '5/16/15', 1)
insert TBL_TEST_OCCURRENCE
values (1, '6/16/15', 1)
insert TBL_TEST_OCCURRENCE
values (1, '10/5/15', 1)
insert TBL_TEST_OCCURRENCE
values (1, '11/6/15', 1)
insert TBL_TEST_OCCURRENCE
values (1, '1/16/16', 1)
insert TBL_TEST_OCCURRENCE
values (1, '2/24/16', 1)
--helpful index
CREATE CLUSTERED INDEX cidx ON dbo.TBL_TEST_OCCURRENCE(TOC_ID,TOC_DATE,TOC_OCCURR) --WHERE (TOC_OCCURR = 1);
select * from tbl_test_occurrence
TOC_ID TOC_DATE TOC_OCCURR
----------- ---------- ----------
1 2015-01-01 1
1 2015-01-02 0
1 2015-04-09 1
1 2015-05-16 1
1 2015-06-16 1
1 2015-10-05 1
1 2015-11-06 1
1 2016-01-16 1
1 2016-02-24 1
2 2015-01-31 1
2 2015-04-01 1
3 2015-02-01 1
4 2015-02-14 1
5 2015-02-26 1
5 2015-04-05 1
5 2015-04-06 0
/* if we filter on just TOC_OCCURR = 1 that gets only the set of rows we are interested in
then we check the prior 3 rows and compare those dates to the current date, flagging those that are within one year of current date
*/
SELECT TOC_ID
,TOC_DATE
,CASE WHEN LAG(TOC_DATE, 1) OVER (PARTITION BY TOC_ID ORDER BY TOC_DATE) > DATEADD(YEAR, -1, TOC_DATE) THEN 1 ELSE 0 END AS Cnt1
,CASE WHEN LAG(TOC_DATE, 2) OVER (PARTITION BY TOC_ID ORDER BY TOC_DATE) > DATEADD(YEAR, -1, TOC_DATE) THEN 1 ELSE 0 END AS Cnt2
,CASE WHEN LAG(TOC_DATE, 3) OVER (PARTITION BY TOC_ID ORDER BY TOC_DATE) > DATEADD(YEAR, -1, TOC_DATE) THEN 1 ELSE 0 END AS Cnt3
FROM tbl_test_occurrence
WHERE TOC_OCCURR = 1
TOC_ID TOC_DATE Cnt1 Cnt2 Cnt3
----------- ---------- ----------- ----------- -----------
1 2015-01-01 0 0 0
1 2015-04-09 1 0 0
1 2015-05-16 1 1 0
1 2015-06-16 1 1 1
1 2015-10-05 1 1 1
1 2015-11-06 1 1 1
1 2016-01-16 1 1 1
1 2016-02-24 1 1 1
2 2015-01-31 0 0 0
2 2015-04-01 1 0 0
3 2015-02-01 0 0 0
4 2015-02-14 0 0 0
5 2015-02-26 0 0 0
5 2015-04-05 1 0 0
/* Now I think the answer is trivial. Since we are only including rows that are an occurrence,
for 3-back row to be within one year of current row means there must be 2 further rows
between that point and the current row. Those 3 rows plus the current one add up to four
*/
;WITH cte as
( SELECT TOC_ID
,TOC_DATE
,CASE WHEN LAG(TOC_DATE, 3) OVER (PARTITION BY TOC_ID ORDER BY TOC_DATE) > DATEADD(YEAR, -1, TOC_DATE) THEN 1 ELSE 0 END AS Cnt3
FROM tbl_test_occurrence
WHERE TOC_OCCURR = 1)
SELECT *
FROM cte
WHERE Cnt3 = 1
TOC_ID TOC_DATE Cnt3
----------- ---------- -----------
1 2015-06-16 1
1 2015-10-05 1
1 2015-11-06 1
1 2016-01-16 1
1 2016-02-24 1
/* lets add in some more test rows to validate. clearly adding in 0 occurrence rows won't
matter in the output since we are filtering on TOC_OCCURR = 1 */
insert TBL_TEST_OCCURRENCE
values (1, '11/08/16', 1) --has only THREE rows in prior year (including itself): 2016-01-16 and 2016-02-24
insert TBL_TEST_OCCURRENCE
values (1, '01/18/17', 1) --has only THREE rows in prior year (including itself): 11/08/16 and 2016-02-24
insert TBL_TEST_OCCURRENCE
values (1, '02/18/17', 1) --has FOUR rows in prior year (including itself): 11/08/16, 01/18/17 and 2016-02-24
--select * from tbl_test_occurrence
TOC_ID TOC_DATE TOC_OCCURR
----------- ---------- ----------
1 2015-01-01 1
1 2015-01-02 0
1 2015-04-09 1
1 2015-05-16 1
1 2015-06-16 1
1 2015-10-05 1
1 2015-11-06 1
1 2016-01-16 1
1 2016-02-24 1
1 2016-11-08 1
1 2017-01-18 1
1 2017-02-18 1
2 2015-01-31 1
2 2015-04-01 1
3 2015-02-01 1
4 2015-02-14 1
5 2015-02-26 1
5 2015-04-05 1
5 2015-04-06 0
;WITH cte as
( SELECT TOC_ID
,TOC_DATE
,CASE WHEN LAG(TOC_DATE, 3) OVER (PARTITION BY TOC_ID ORDER BY TOC_DATE) > DATEADD(YEAR, -1, TOC_DATE) THEN 1 ELSE 0 END AS Cnt3
FROM tbl_test_occurrence
WHERE TOC_OCCURR = 1)
SELECT *
FROM cte
WHERE Cnt3 = 1
TOC_ID TOC_DATE Cnt3
----------- ---------- -----------
1 2015-06-16 1
1 2015-10-05 1
1 2015-11-06 1
1 2016-01-16 1
1 2016-02-24 1
1 2017-02-18 1
--prove that partitioning works for TOC_ID
insert TBL_TEST_OCCURRENCE
values (2, '4/9/15', 1)
insert TBL_TEST_OCCURRENCE
values (2, '5/16/15', 1)
insert TBL_TEST_OCCURRENCE
values (2, '6/16/15', 1)
insert TBL_TEST_OCCURRENCE
values (2, '10/5/15', 1)
insert TBL_TEST_OCCURRENCE
values (2, '11/6/15', 1)
insert TBL_TEST_OCCURRENCE
values (2, '1/16/16', 1)
insert TBL_TEST_OCCURRENCE
values (2, '2/24/16', 1)
;WITH cte as
( SELECT TOC_ID
,TOC_DATE
,CASE WHEN LAG(TOC_DATE, 3) OVER (PARTITION BY TOC_ID ORDER BY TOC_DATE) > DATEADD(YEAR, -1, TOC_DATE) THEN 1 ELSE 0 END AS Cnt3
FROM tbl_test_occurrence
WHERE TOC_OCCURR = 1)
SELECT *
FROM cte
WHERE Cnt3 = 1
--cleanup
DROP TABLE tbl_test_occurrence
Best,
Kevin G. Boles
SQL Server Consultant
SQL MVP 2007-2012
TheSQLGuru on googles mail service
January 2, 2017 at 10:01 am
mightycaptain (1/1/2017)
Have a problem that has me stumped. MSSQL 2014. The problem is this. Have a table storing id, date, occurrence(bit field).id can be repeated. The date for the same id could be repeated. Some rows are an occurrence and some are not. Each id could have a variety of different dates, and what I need to know is for each id if at any point in any rolling 12 month period unique to that id from the date of the last occurrence if there are 4 or more occurrences. It would be possible for each id to pass that occurrence count based on the dates multiple times within a period of time using previous occurrences if they fell within a rolling 12 month period.
id date occurrence
1 1/1/15 1
1 1/2/15 0
2 1/31/15 1
3 2/1/15 1
4 2/14/15 1
5 2/26/15 1
2 4/1/15 1
5 4/5/15 1
5 4/6/15 0
1 4/9/15 1
1 5/16/15 1
1 6/16/15 1
1 10/5/15 1
1 11/6/15 1
1 1/16/16 1
1 2/24/16 1
So in dataset above id "1" had 4 or more occurrences 1/1/15,4/9/15,5/16/15,6/16/15 and in fact had 6 total within 12 months including 10/5/15 and 11/6/15. So I need to always figure from the newest date that is an occurrence in the table and if in the 12 months prior to that did the id reach the occurrence trigger.
Any help on the T-SQL would be appreciated.
Thanks
I also didn't benchmark against the million rows thing because I just don't have the time, but this seemed rather simple to create. Here's my result, but you'll probably want to index the table appropriately:
CREATE TABLE dbo.OCCURRENCES (
id int,
occurred_date date,
occurrence tinyint
);
INSERT INTO dbo.OCCURRENCES(id, occurred_date, occurrence)
VALUES(1, '01/01/15', 1),
(1, '01/02/15', 0),
(2, '01/31/15', 1),
(3, '02/01/15', 1),
(4, '02/14/15', 1),
(5, '02/26/15', 1),
(2, '04/01/15', 1),
(5, '04/05/15', 1),
(5, '04/06/15', 0),
(1, '04/09/15', 1),
(1, '05/16/15', 1),
(1, '06/16/15', 1),
(1, '10/05/15', 1),
(1, '11/06/15', 1),
(1, '01/16/16', 1),
(1, '02/24/16', 1);
WITH LAST_OCCURRENCE AS (
SELECT id, MAX(occurred_date) AS MAX_OCCURRED_DATE, DATEADD(day, 1, DATEADD(year, -1, MAX(occurred_date))) AS WINDOW_START_DATE
FROM dbo.OCCURRENCES
GROUP BY id
)
SELECT O.id, L.WINDOW_START_DATE, L.MAX_OCCURRED_DATE, SUM(O.occurrence) AS NUMBER_OF_OCCURRENCES
FROM dbo.OCCURRENCES AS O
INNER JOIN LAST_OCCURRENCE AS L
ON O.id = L.id
AND O.occurred_date BETWEEN L.WINDOW_START_DATE AND L.MAX_OCCURRED_DATE
GROUP BY O.id, L.WINDOW_START_DATE, L.MAX_OCCURRED_DATE
HAVING SUM(occurrence) > 3
ORDER BY O.id;
Let me know if it works, as well as how it performs, and what, if any, indexes, you end up with.
Steve (aka sgmunson) 🙂 🙂 🙂
Rent Servers for Income (picks and shovels strategy)
Viewing 4 posts - 1 through 4 (of 4 total)
You must be logged in to reply to this topic. Login to reply