July 26, 2015 at 9:51 am
I am writing some scripts to calculate the mean (AVG), STDEV, COUNT and PERCENTILE_CONT for some data.
I am partitioning over multiple columns, e.g. PARTITION BY year, month, day ORDER BY value.
My question is: am I incurring unnecessary overhead, given that I throw away all but the last (or MAX) value in each window? (I use a CTE, then SELECT MAX from the CTE.)
That is, I only care about the aggregate over the window in its entirety, not over specific sub-windows.
I have traditionally used GROUP BY for this.
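To make the comparison concrete, here is a minimal sketch of the two patterns, against a hypothetical table t([year], [month], [day], value):
-- Pattern 1: what I do now -- windowed aggregate, then collapse with MAX
;WITH CTE AS (
SELECT [year], [month], [day]
,AVG(value) OVER (PARTITION BY [year], [month], [day]) AS avg_val
FROM t
)
SELECT [year], [month], [day], MAX(avg_val) AS avg_val
FROM CTE
GROUP BY [year], [month], [day];
-- Pattern 2: what I traditionally did -- plain GROUP BY
SELECT [year], [month], [day], AVG(value) AS avg_val
FROM t
GROUP BY [year], [month], [day];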
What do you suggest?
Hopefully the question is clear.
July 26, 2015 at 10:00 am
tct_sql_admin (7/26/2015)
I am writing some scripts to calculate the mean (AVG), STDEV, COUNT and PERCENTILE_CONT for some data. I am partitioning over multiple columns, e.g. PARTITION BY year, month, day ORDER BY value.
My question is: am I incurring unnecessary overhead, given that I throw away all but the last (or MAX) value in each window? (I use a CTE, then SELECT MAX from the CTE.)
That is, I only care about the aggregate over the window in its entirety, not over specific sub-windows.
I have traditionally used GROUP BY for this.
What do you suggest?
Hopefully the question is clear.
Quick question: can you post a DDL (CREATE TABLE) script, sample data in the form of an INSERT statement, what you have tried so far, and the desired results?
😎
To answer the question, it depends.
July 26, 2015 at 10:38 am
Thank you for taking the time to look at this.
I have knocked up the script below.
Obviously the CAST to DATETIME is redundant here, but my real data may be stored as a non-DATETIME type.
CREATE TABLE #TEMP_DATA (
DATE_TIME DATETIME,
VALUE INT)
DECLARE @RowCount INT
DECLARE @random INT, @UpperDate INT, @LowerDate INT
DECLARE @LowerVal INT, @UpperVal INT
DECLARE @InsertDate DATETIME
DECLARE @RowVal INT
-- Dates land in the two years up to yesterday; values in 0 to 999,999
SET @LowerDate = -730
SET @UpperDate = -1
SET @LowerVal = 0
SET @UpperVal = 1000000
SET @RowCount = 0
-- Insert 100,000 rows, each with a random date offset and a random value
WHILE @RowCount < 100000
BEGIN
SET @RowVal = ROUND(((@UpperVal - @LowerVal - 1) * RAND() + @LowerVal), 0)
SET @random = ROUND(((@UpperDate - @LowerDate - 1) * RAND() + @LowerDate), 0)
SET @InsertDate = DATEADD(dd, @random, GETDATE())
INSERT INTO #TEMP_DATA
(DATE_TIME
,VALUE)
VALUES
(@InsertDate, @RowVal)
SET @RowCount = @RowCount + 1
END
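As an aside, a set-based alternative to the loop above might look like this (just a sketch; it should produce the same shape of data in a single INSERT):
-- Set-based version: one INSERT instead of 100,000 singletons
INSERT INTO #TEMP_DATA (DATE_TIME, VALUE)
SELECT TOP (100000)
DATEADD(dd, -1 - ABS(CHECKSUM(NEWID())) % 730, GETDATE()) -- random day in the last two years
,ABS(CHECKSUM(NEWID())) % 1000000 -- random value 0 to 999,999
FROM sys.all_columns a CROSS JOIN sys.all_columns b;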
SELECT TOP 100 * FROM #TEMP_DATA
WITH CTE AS (
SELECT
DATEPART(YEAR,CAST(LEFT(DATE_TIME, 19) as datetime)) [YEAR]
,DATEPART(MONTH,CAST(LEFT(DATE_TIME, 19) as datetime)) [MONTH]
,DATEPART(DAY,CAST(LEFT(DATE_TIME, 19) as datetime)) [DAY]
,COUNT(VALUE) OVER (PARTITION BY DATEPART(YEAR,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(MONTH,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(DAY,CAST(LEFT(DATE_TIME, 19) as datetime))
) [COUNT]
,AVG(VALUE) OVER (PARTITION BY DATEPART(YEAR,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(MONTH,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(DAY,CAST(LEFT(DATE_TIME, 19) as datetime))
) [AVG_VAL]
,STDEV(VALUE) OVER (PARTITION BY DATEPART(YEAR,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(MONTH,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(DAY,CAST(LEFT(DATE_TIME, 19) as datetime))
) [STDEV_VAL]
,PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY VALUE)
OVER (PARTITION BY DATEPART(YEAR,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(MONTH,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(DAY,CAST(LEFT(DATE_TIME, 19) as datetime))
) [MEDIAN]
,PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY VALUE)
OVER (PARTITION BY DATEPART(YEAR,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(MONTH,CAST(LEFT(DATE_TIME, 19) as datetime))
,DATEPART(DAY,CAST(LEFT(DATE_TIME, 19) as datetime))
) [99THPERCENTILE]
FROM #TEMP_DATA
)
SELECT
[YEAR]
,[MONTH]
,[DAY]
,MAX([COUNT]) [NO_OF_SAMPLES],
MAX(AVG_VAL) [AVG],
MAX(STDEV_VAL) [STD_DEV],
MAX(MEDIAN) [MEDIAN],
MAX([99THPERCENTILE]) [99THPERCENTILE]
FROM CTE
GROUP BY
[YEAR],[MONTH],[DAY]
ORDER BY
[YEAR],[MONTH],[DAY]
July 26, 2015 at 4:58 pm
The problem here isn't the window functions as such, but the way they are being used. The query requires the set to be sorted four times, and each sort is likely to spill into tempdb.
😎
Here is a quick modification of the original query: roughly three times quicker, with half the worktable I/O. The key is to have a single sorted column with the right granularity to fit the window functions' partitioning and ordering.
Obviously, the outputs are identical.
;WITH BASE_DATA AS
(
SELECT
CAST(LEFT(TD.DATE_TIME, 19) as DATE) AS [DATE]
,TD.VALUE
FROM #TEMP_DATA TD
)
,CALCULATED_SET AS
(
SELECT
BD.[DATE]
,COUNT(BD.VALUE) OVER (PARTITION BY BD.[DATE]) AS [COUNT]
,AVG(BD.VALUE) OVER (PARTITION BY BD.[DATE]) AS [AVG_VAL]
,STDEV(BD.VALUE) OVER (PARTITION BY BD.[DATE]) AS [STDEV_VAL]
,PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY BD.VALUE ASC) OVER (PARTITION BY BD.[DATE]) AS [MEDIAN]
,PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY BD.VALUE ASC) OVER (PARTITION BY BD.[DATE]) AS [99THPERCENTILE]
FROM BASE_DATA BD
)
,FINAL_SET AS
(
SELECT
CS.[DATE]
,MAX(CS.[COUNT] ) AS [COUNT]
,MAX(CS.[AVG_VAL] ) AS [AVG_VAL]
,MAX(CS.[STDEV_VAL] ) AS [STDEV_VAL]
,MAX(CS.[MEDIAN] ) AS [MEDIAN]
,MAX(CS.[99THPERCENTILE]) AS [99THPERCENTILE]
FROM CALCULATED_SET CS
GROUP BY CS.[DATE]
)
SELECT
YEAR(FS.[DATE]) AS [YEAR]
,MONTH(FS.[DATE]) AS [MONTH]
,DAY(FS.[DATE]) AS [DAY]
,FS.[COUNT] AS [NO_OF_SAMPLES]
,FS.AVG_VAL AS [AVG]
,FS.STDEV_VAL AS [STD_DEV]
,FS.MEDIAN AS [MEDIAN]
,FS.[99THPERCENTILE] AS [99THPERCENTILE]
FROM FINAL_SET FS;
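One way to compare the two versions on your own data is to run both with statistics on, then compare the worktable reads and elapsed times in the Messages tab:
SET STATISTICS IO ON;
SET STATISTICS TIME ON;
-- ... run the original query here ...
-- ... run the revised query here ...
SET STATISTICS IO OFF;
SET STATISTICS TIME OFF;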
Edit: typo
July 27, 2015 at 3:07 am
This is a fantastic reply.
"The key is to have a single sorted column with the right granularity to fit the window functions' partitioning and ordering."
I am trying to understand this principle :-).
So let me change the scenario a bit and add more columns to partition by, including the hour within each date.
I have attempted to use your template, but I feel I am missing the point.
Here is the data:
DROP TABLE #TEMP_DATA
-- Five cohorts of 10,000 rows each; CARMICHAEL appears on three different shifts
CREATE TABLE #TEMP_DATA (
DATE_TIME DATETIME,
SURNAME NVARCHAR(MAX),
FIRSTNAME NVARCHAR(MAX),
OCCUPATION NVARCHAR(MAX),
[SHIFT] NVARCHAR(MAX),
VALUE INT)
DECLARE @RowCount INT
DECLARE @Random INT, @UpperDate INT, @LowerDate INT
DECLARE @LowerVal INT, @UpperVal INT
DECLARE @InsertDate DATETIME
DECLARE @RowVal INT
SET @LowerDate = -730
SET @UpperDate = -1
SET @LowerVal = 0
SET @UpperVal = 1000000
SET @RowCount = 0
WHILE @RowCount < 10000
BEGIN
SET @RowVal = ROUND(((@UpperVal - @LowerVal -1) * RAND() + @LowerVal), 0)
SELECT @Random = ((@UpperDate - @LowerDate -1) * RAND() + @LowerDate)
SET @InsertDate = DATEADD(dd, @Random, GETDATE())
INSERT INTO #TEMP_DATA
(DATE_TIME, SURNAME, FIRSTNAME, OCCUPATION, [SHIFT]
,VALUE)
VALUES
(@InsertDate,'ADAMS', 'JOHN', 'CARPENTER', 'NIGHT', @RowVal)
SET @RowCount = @RowCount + 1
END
SET @RowCount = 0
WHILE @RowCount < 10000
BEGIN
SET @RowVal = ROUND(((@UpperVal - @LowerVal -1) * RAND() + @LowerVal), 0)
SELECT @Random = ((@UpperDate - @LowerDate -1) * RAND() + @LowerDate)
SET @InsertDate = DATEADD(dd, @Random, GETDATE())
INSERT INTO #TEMP_DATA
(DATE_TIME, SURNAME, FIRSTNAME, OCCUPATION, [SHIFT]
,VALUE)
VALUES
(@InsertDate,'BOSTOCK', 'ALFRED', 'PLUMBER', 'DAY', @RowVal)
SET @RowCount = @RowCount + 1
END
SET @RowCount = 0
WHILE @RowCount < 10000
BEGIN
SET @RowVal = ROUND(((@UpperVal - @LowerVal -1) * RAND() + @LowerVal), 0)
SELECT @Random = ((@UpperDate - @LowerDate -1) * RAND() + @LowerDate)
SET @InsertDate = DATEADD(dd, @Random, GETDATE())
INSERT INTO #TEMP_DATA
(DATE_TIME, SURNAME, FIRSTNAME, OCCUPATION, [SHIFT]
,VALUE)
VALUES
(@InsertDate,'CARMICHAEL', 'MICHAEL', 'DENTIST', 'EARLY', @RowVal)
SET @RowCount = @RowCount + 1
END
SET @RowCount = 0
WHILE @RowCount < 10000
BEGIN
SET @RowVal = ROUND(((@UpperVal - @LowerVal -1) * RAND() + @LowerVal), 0)
SELECT @Random = ((@UpperDate - @LowerDate -1) * RAND() + @LowerDate)
SET @InsertDate = DATEADD(dd, @Random, GETDATE())
INSERT INTO #TEMP_DATA
(DATE_TIME, SURNAME, FIRSTNAME, OCCUPATION, [SHIFT]
,VALUE)
VALUES
(@InsertDate,'CARMICHAEL', 'MICHAEL', 'DENTIST', 'DAY', @RowVal)
SET @RowCount = @RowCount + 1
END
SET @RowCount = 0
WHILE @RowCount < 10000
BEGIN
SET @RowVal = ROUND(((@UpperVal - @LowerVal -1) * RAND() + @LowerVal), 0)
SELECT @Random = ((@UpperDate - @LowerDate -1) * RAND() + @LowerDate)
SET @InsertDate = DATEADD(dd, @Random, GETDATE())
INSERT INTO #TEMP_DATA
(DATE_TIME, SURNAME, FIRSTNAME, OCCUPATION, [SHIFT]
,VALUE)
VALUES
(@InsertDate,'CARMICHAEL', 'MICHAEL', 'DENTIST', 'NIGHT', @RowVal)
SET @RowCount = @RowCount + 1
END
SELECT TOP 100 * FROM #TEMP_DATA
Here is your template adapted to query this with window functions:
;WITH BASE_DATA AS
(
SELECT
SURNAME, FIRSTNAME, OCCUPATION, [SHIFT]
,CAST(LEFT(TD.DATE_TIME, 19) as DATE) AS [DATE]
,DATEPART(HOUR,CAST(LEFT(TD.DATE_TIME, 19) as datetime)) [HOUR]
,TD.VALUE
FROM #TEMP_DATA TD
)
,CALCULATED_SET AS
(
SELECT
SURNAME, FIRSTNAME, OCCUPATION, [SHIFT]
,BD.[DATE], [HOUR]
,COUNT(BD.VALUE) OVER (PARTITION BY SURNAME, FIRSTNAME, OCCUPATION,[SHIFT],BD.[DATE],[HOUR]) AS [COUNT]
,AVG(BD.VALUE) OVER (PARTITION BY SURNAME, FIRSTNAME, OCCUPATION,[SHIFT],BD.[DATE],[HOUR]) AS [AVG_VAL]
,STDEV(BD.VALUE) OVER (PARTITION BY SURNAME, FIRSTNAME, OCCUPATION,[SHIFT],BD.[DATE],[HOUR]) AS [STDEV_VAL]
,PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY BD.VALUE ASC) OVER (PARTITION BY SURNAME, FIRSTNAME, OCCUPATION,[SHIFT],BD.[DATE],[HOUR]) AS [MEDIAN]
,PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY BD.VALUE ASC) OVER (PARTITION BY SURNAME, FIRSTNAME, OCCUPATION,[SHIFT],BD.[DATE],[HOUR]) AS [99THPERCENTILE]
FROM BASE_DATA BD
)
,FINAL_SET AS
(
SELECT
SURNAME, FIRSTNAME, OCCUPATION,[SHIFT]
,CS.[DATE], [HOUR]
,MAX(CS.[COUNT] ) AS [COUNT]
,MAX(CS.[AVG_VAL] ) AS [AVG_VAL]
,MAX(CS.[STDEV_VAL] ) AS [STDEV_VAL]
,MAX(CS.[MEDIAN] ) AS [MEDIAN]
,MAX(CS.[99THPERCENTILE]) AS [99THPERCENTILE]
FROM CALCULATED_SET CS
GROUP BY SURNAME, FIRSTNAME, OCCUPATION,[SHIFT],CS.[DATE],[HOUR]
)
SELECT
SURNAME, FIRSTNAME, OCCUPATION, [SHIFT]
,YEAR(FS.[DATE]) AS [YEAR]
,MONTH(FS.[DATE]) AS [MONTH]
,DAY(FS.[DATE]) AS [DAY]
,[HOUR] AS [HOUR]
,FS.[COUNT] AS [NO_OF_SAMPLES]
,FS.AVG_VAL AS [AVG]
,FS.STDEV_VAL AS [STD_DEV]
,FS.MEDIAN AS [MEDIAN]
,FS.[99THPERCENTILE] AS [99THPERCENTILE]
FROM FINAL_SET FS;
July 27, 2015 at 7:24 am
Here is the previous query adjusted to the changes. It uses HASHBYTES to collapse all of the partitioning columns into a single value; whether that is beneficial depends on the width of the columns. (NCHAR(124) is the pipe character '|', used as a delimiter so that adjacent column values cannot concatenate ambiguously.) The plan has two sort operators: one at the beginning, whose sort order is reused all the way through, and a second introduced at the very end at almost no cost.
😎
USE tempdb;
GO
SET NOCOUNT ON;
GO
;WITH DATE_TIME_CONVERSION AS
(
SELECT
CONVERT(DATETIME,TD.[DATE_TIME],121) AS [DATE_TIME]
,TD.[SURNAME]
,TD.[FIRSTNAME]
,TD.[OCCUPATION]
,TD.[SHIFT]
,TD.[VALUE]
FROM #TEMP_DATA TD
)
,BASE_DATA AS
(
SELECT
DTC.DATE_TIME
,(CONVERT(INT,CONVERT(VARCHAR(8),DTC.DATE_TIME,112),0) * 100) + DATEPART(HOUR,(DTC.DATE_TIME)) AS DT_INT
,HASHBYTES('SHA1',CONCAT((CONVERT(INT,CONVERT(VARCHAR(8),DTC.DATE_TIME,112),0) * 100) + DATEPART(HOUR,(DTC.DATE_TIME))
,CONCAT
(
DTC.[SURNAME]
,NCHAR(124)
,DTC.[FIRSTNAME]
,NCHAR(124)
,DTC.[OCCUPATION]
,NCHAR(124)
,DTC.[SHIFT]
))) AS WFPOCOL
,DTC.[SURNAME]
,DTC.[FIRSTNAME]
,DTC.[OCCUPATION]
,DTC.[SHIFT]
,DTC.[VALUE]
FROM DATE_TIME_CONVERSION DTC
)
,CALCULATED_SET AS
(
SELECT
BD.SURNAME
,BD.FIRSTNAME
,BD.OCCUPATION
,BD.[SHIFT]
,BD.DATE_TIME
,BD.DT_INT
,COUNT(BD.VALUE) OVER (PARTITION BY BD.WFPOCOL) AS [COUNT]
,AVG(BD.VALUE) OVER (PARTITION BY BD.WFPOCOL) AS [AVG_VAL]
,STDEV(BD.VALUE) OVER (PARTITION BY BD.WFPOCOL) AS [STDEV_VAL]
,PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY BD.VALUE ASC) OVER (PARTITION BY BD.WFPOCOL) AS [MEDIAN]
,PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY BD.VALUE ASC) OVER (PARTITION BY BD.WFPOCOL) AS [99THPERCENTILE]
FROM BASE_DATA BD
)
,FINAL_SET AS
(
SELECT
CS.SURNAME
,CS.FIRSTNAME
,CS.OCCUPATION
,CS.[SHIFT]
,MAX(CS.[DATE_TIME] ) AS [DATE_TIME]
,MAX(CS.[COUNT] ) AS [COUNT]
,MAX(CS.[AVG_VAL] ) AS [AVG_VAL]
,MAX(CS.[STDEV_VAL] ) AS [STDEV_VAL]
,MAX(CS.[MEDIAN] ) AS [MEDIAN]
,MAX(CS.[99THPERCENTILE]) AS [99THPERCENTILE]
FROM CALCULATED_SET CS
GROUP BY SURNAME, FIRSTNAME, OCCUPATION,[SHIFT],CS.DT_INT
)
SELECT
FS.SURNAME
,FS.FIRSTNAME
,FS.OCCUPATION
,FS.[SHIFT]
,YEAR(FS.[DATE_TIME]) AS [YEAR]
,MONTH(FS.[DATE_TIME]) AS [MONTH]
,DAY(FS.[DATE_TIME]) AS [DAY]
,DATEPART(HOUR, FS.DATE_TIME) AS [HOUR]
,FS.[COUNT] AS [NO_OF_SAMPLES]
,FS.AVG_VAL AS [AVG]
,FS.STDEV_VAL AS [STD_DEV]
,FS.MEDIAN AS [MEDIAN]
,FS.[99THPERCENTILE] AS [99THPERCENTILE]
FROM FINAL_SET FS
;
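For reference, the DT_INT expression packs the date and the hour into a single integer. A quick sanity check of the encoding (just a sketch):
SELECT CONVERT(INT, CONVERT(VARCHAR(8), GETDATE(), 112), 0) * 100
+ DATEPART(HOUR, GETDATE()) AS DT_INT;
-- Style 112 yields yyyymmdd, so e.g. 2015-07-27 07:xx encodes as 2015072707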