--© 2021 | ByrdNest Consulting
--Multi-Column Nonclustered Indexes Selection


SELECT @@VERSION;
/*
Microsoft SQL Server 2019 (RTM-CU10) (KB5001090) - 15.0.4123.1 (X64) 
	Mar 22 2021 18:10:24 
	Copyright (C) 2019 Microsoft Corporation
	Developer Edition (64-bit) on Windows 10 Pro 10.0 <X64> (Build 19041: )
*/

--start with clean copy of AdventureWorks2017Big
USE [master]
GO
ALTER DATABASE [AdventureWorks2017Big] 
	SET SINGLE_USER WITH ROLLBACK IMMEDIATE			--kick off all users except me
RESTORE DATABASE [AdventureWorks2017Big] FROM  DISK = N'E:\SQL_Data2019\Backups\AdventureWorks2017Big.bak' WITH  FILE = 1,  
	MOVE N'AdventureWorks2017' TO N'D:\SQL_Data2019\Data\AdventureWorks2017Big.mdf',  
	MOVE N'AdventureWorks2017_log' TO N'D:\SQL_Data2019\Logs\AdventureWorks2017Big_log.ldf',  NOUNLOAD,  REPLACE,  STATS = 10;
ALTER DATABASE [AdventureWorks2017Big] SET MULTI_USER	--set back to multi-user
GO
ALTER AUTHORIZATION ON DATABASE::AdventureWorks2017Big TO sa;	--give ownership to sa; not me
GO
ALTER DATABASE [AdventureWorks2017Big] SET COMPATIBILITY_LEVEL = 150	-- make it SS2019 compatible; but these scripts will run any version of SS from 2008
GO

USE AdventureWorks2017Big
GO
--00:00:25

--data is too even in SalesPerson table; below script adds more rows for SalePersonID = 289
UPDATE Sales.SalesOrderHeaderBig
	SET SalesPersonID = CASE WHEN SalesOrderID % 20 IN (0,1,2,3,4,5,6,7,8,9) THEN 289
						ELSE SalesPersonID END
	WHERE SalesPersonID IS NULL
GO

--Drop all nonclustered indexes
DROP INDEX IF EXISTS [AK_SalesOrderHeader_rowguid] ON [Sales].[SalesOrderHeader]
GO
DROP INDEX IF EXISTS [AK_SalesOrderHeader_SalesOrderNumber] ON [Sales].[SalesOrderHeader]
GO
DROP INDEX IF EXISTS [IX_SalesOrderHeader_CustomerID] ON [Sales].[SalesOrderHeader]
GO
DROP INDEX IF EXISTS [IX_SalesOrderHeader_SalesPersonID] ON [Sales].[SalesOrderHeader]
GO

--Clustered index for Sales.SalesOrderHeader based on single column SalesOrderID



SELECT [Status], count(*) StatusCount,SUM(CASE WHEN ModifiedDate < '1/1/2014' THEN 0 ELSE 1 END) [StatusCountGT_1/1/2014]
	FROM [Sales].[SalesOrderHeaderBig]
	GROUP BY Status
	ORDER BY 1;
GO
/*
Status	StatusCount	StatusCountGT_1/1/2014
0		154297		131023
1		154296		127461
2		154297		127464
3		154296		127461
4		159552		127462		--arbitraily selected for query
5		161006		127463
6		161004		127461
7		159852		132600
*/

SELECT SalesPersonID, COUNT(*) SalesPersonIDCount
	,SUM(CASE WHEN ModifiedDate < '1/1/2014' THEN 0 ELSE 1 END) [SalesPersonIDCountGT_1/1/2014]
	FROM [Sales].[SalesOrderHeaderBig]
	GROUP BY SalesPersonID
	ORDER BY 1 ;
GO
/*
SalesPersonID	SalesPersonIDCount	SalesPersonIDCountGT_1/1/2014
NULL			642288				464246
274				1920				1151
275				18000				10880
276				16720				10501
277				18920				11723
278				9360				5868
279				17160				10145
280				3800				1674
281				9680				6056
282				10840				6009
283				7560				4623
284				5600				4675
285				640					577	
286				4360				3985
287				1560				1251
288				5200				4721
289				477992				474716		--selected for query
290				7000				5594
*/

DBCC FREEPROCCACHE		-- don't use this in a production database
GO

--	This is the baseline query we are using for the index testing
SET STATISTICS IO, TIME ON
GO
SELECT [Status],SalesPersonID,ModifiedDate,TotalDue
	FROM Sales.SalesOrderHeaderBig
	WHERE [Status] = 4
	  AND SalesPersonID = 289
	  AND ModifiedDate >= '1/1/2014'
	ORDER BY [Status],SalesPersonID,ModifiedDate
GO
SET STATISTICS IO,TIME OFF
GO
/*	Clustered Index Scan
	58,967 rows
	29295 logical reads
	cpu = 588 ms
	elapsed time = 584
	total query cost = 24.4894
	and the parameters in the Predicate same order as in the WHERE clause
*/

--	let's see if there is any difference if I move the ModifiedDate to the first parameter in the WHERE clause
DBCC FREEPROCCACHE		-- don't use this in a production database
GO
SET STATISTICS IO, TIME ON
GO
SELECT [Status],SalesPersonID,ModifiedDate,TotalDue
	FROM Sales.SalesOrderHeaderBig
	WHERE ModifiedDate >= '1/1/2014'
	  AND [Status] = 4
	  AND SalesPersonID = 289
	ORDER BY [Status],SalesPersonID,ModifiedDate
GO
SET STATISTICS IO,TIME OFF
GO
/*
	58,967 rows
	29295 logical reads
	cpu = 330 ms	-- I'm not sure why this is lower, but possibily due to how the data is sorted in the clustered index
	elapsed time = 538
	total query cost = 24.4894
	and the parameters in the Predicate same order as in the WHERE clause
*/


--create 3 different indexes containing same columns in differing order; 
--	IX_SalesOrderHeader_StatusSalesPersonIDModifiedDate is the suggested index
CREATE NONCLUSTERED INDEX IX_SalesOrderHeader_StatusSalesPersonIDModifiedDate 
	ON Sales.SalesOrderHeaderBig ([Status] ASC, SalesPersonID ASC, ModifiedDate ASC)
	INCLUDE (TotalDue)
	WITH (Data_Compression = ROW)
GO
CREATE NONCLUSTERED INDEX IX_SalesOrderHeader_SalesPersonIDStatusModifiedDate 
	ON Sales.SalesOrderHeaderBig (SalesPersonID ASC,[Status] ASC, ModifiedDate ASC)
	INCLUDE (TotalDue)
	WITH (Data_Compression = ROW)
GO
CREATE NONCLUSTERED INDEX IX_SalesOrderHeader_ModifiedDateStatusSalesPersonID 
	ON Sales.SalesOrderHeaderBig ( ModifiedDate ASC,[Status] ASC, SalesPersonID ASC)
	INCLUDE (TotalDue)
	WITH (Data_Compression = ROW)
GO


--now rerun original query (with all 3 indexes active as defined above)
DBCC FREEPROCCACHE		-- don't use this in a production database
GO
SET STATISTICS IO, TIME ON
GO
SELECT [Status],SalesPersonID,ModifiedDate,TotalDue
	FROM Sales.SalesOrderHeaderBig
	WHERE [Status] = 4
	  AND SalesPersonID = 289
	  AND ModifiedDate >= '1/1/2014'
	ORDER BY [Status],SalesPersonID,ModifiedDate
GO
SET STATISTICS IO, TIME OFF
GO
/*	Non-clustered Index seek
	58,967 rows
	174 logical reads
	cpu = 39 ms	
	elapsed time = 529
	total query cost = 0.306444
	but used IX_SalesOrderHeader_SalesPersonIDStatusModifiedDate instead of suggested IX_SalesOrderHeader_StatusSalesPersonIDModifiedDate
*/

--so let's force the query to use the suggested index
DBCC FREEPROCCACHE		-- don't use this in a production database
GO
SET STATISTICS IO, TIME ON
GO
SELECT [Status],SalesPersonID,ModifiedDate,TotalDue
	FROM Sales.SalesOrderHeaderBig 
		WITH (INDEX (IX_SalesOrderHeader_StatusSalesPersonIDModifiedDate))
 	WHERE [Status] = 7
	  AND SalesPersonID = 289
	  AND ModifiedDate BETWEEN '1/1/2014' AND '6/30/2021'
	ORDER BY [Status],SalesPersonID,ModifiedDate
GO
SET STATISTICS IO, TIME OFF
GO
/*	Non-clustered index seek
	58,967 rows
	182 logical reads
	cpu = 42 ms	
	elapsed time = 547
	total query cost = 0.30707
	slightly higher performance cost in logical reads, cpu, elapsed time, and total query cost;
	but still much better than original query with no non-clustered indexex
*/

-- so for completeness, let's force the query to use the index with ModifiedDate as the first column
DBCC FREEPROCCACHE		-- don't use this in a production database
GO
SET STATISTICS IO,TIME ON
GO
SELECT [Status],SalesPersonID,ModifiedDate,TotalDue
	FROM Sales.SalesOrderHeaderBig 
		WITH (INDEX (IX_SalesOrderHeader_ModifiedDateStatusSalesPersonID))
 	WHERE [Status] = 4
	  AND SalesPersonID = 289
	  AND ModifiedDate BETWEEN '1/1/2014' AND '6/30/2021'
	ORDER BY [Status],SalesPersonID,ModifiedDate
GO
SET STATISTICS IO,TIME OFF
GO
/*	Non-clustered index seek
	58,967 rows
	2868 logical reads
	cpu = 118 ms	
	elapsed time = 535
	total query cost = 3.43921
	even higher performance cost in logical reads, cpu, elapsed time, and total query cost;
	but still much better than original query with no non-clustered indexex
*/


--let's look at how the data is organized in each column

--Looking at the suggested index ordering (IX_SalesOrderHeader_StatusSalesPersonIDModifiedDate):
SELECT [Status], COUNT(*) StatusCount
	FROM Sales.SalesOrderHeaderBig
	GROUP BY [Status]
	ORDER BY [Status]
/*
Status	StatusCount
0		154297
1		154296
2		154297
3		154296
4		159552
5		161006
6		161004
7		159852
*/
--	there are 159,552 rows with status = 4

SELECT [Status],SalesPersonID, COUNT(*) SalesPersonIDCount
	FROM Sales.SalesOrderHeaderBig
	WHERE [Status] = 4
	GROUP BY [Status],SalesPersonID
	ORDER BY [Status],SalesPersonID
/*
Status	SalesPersonID	SalesPersonIDCount
4		NULL			81738
4		274				265
4		275				2336
4		276				2235
4		277				2537
4		278				1262
4		279				2294
4		280				518
4		281				1263
4		282				1451
4		283				1004
4		284				710
4		285				93
4		286				564
4		287				198
4		288				660
4		289				59496
4		290				928
*/	
-- there are 59,496 rows with Status = 4 and SalesPersonID = 289

SELECT COUNT(*) ModifiedDateCount
	FROM Sales.SalesOrderHeaderBig
	WHERE [Status] = 4
	  AND SalesPersonID = 289
	  AND ModifiedDate BETWEEN '1/1/2014' AND '6/30/2021'
/*
ModifiedDateCount
58967
*/

--whereas looking at the best performing index (IX_SalesOrderHeader_SalesPersonIDStatusModifiedDate)
SELECT SalesPersonID, COUNT(*) SalesPersonIDCount
	FROM Sales.SalesOrderHeaderBig
	GROUP BY SalesPersonID
	ORDER BY SalesPersonID
/*
SalesPersonID	SalesPersonIDCount
NULL	642288
274		1920
275		18000
276		16720
277		18920
278		9360
279		17160
280		3800
281		9680
282		10840
283		7560
284		5600
285		640
286		4360
287		1560
288		5200
289		477992
290		7000
*/
--	there are 477,992 rows with SalesPersonID = 289

SELECT SalesPersonID,[Status], COUNT(*) StatusIDCount
	FROM Sales.SalesOrderHeaderBig
	WHERE SalesPersonID = 289
	GROUP BY SalesPersonID,[Status]
	ORDER BY SalesPersonID,[Status]
/*
SalesPersonID	Status	StatusIDCount
289				0		60360
289				1		59031
289				2		59305
289				3		58856
289				4		59496
289				5		59469
289				6		59673
289				7		61802
*/	
--And now there are 59,496 rows for SalesPersonID = 289 and Status = 4




USE AdventureWorks2017Big
GO
dbcc showcontig ('Sales.SalesOrderHeaderBig') with tableresults, all_indexes, all_levels;
GO

--generate data for spreadsheet (data distribution within each non-clustered index)
SELECT SalesPersonID,[Status], SUBSTRING(CONVERT(VARCHAR(25),ModifiedDate,112),1,6)
		, count(*)
	FROM Sales.SalesOrderHeaderBig
	WHERE SalesPersonID IN (288,289,290)
	  AND [Status] IN (3,4,5)
	  AND ModifiedDate >= '12/1/2013'
	GROUP BY SalesPersonID,[Status], SUBSTRING(CONVERT(VARCHAR(25),ModifiedDate,112),1,6)
	ORDER BY 1,2,3
GO

SELECT [Status],SalesPersonID, SUBSTRING(CONVERT(VARCHAR(25),ModifiedDate,112),1,6)
		, count(*)
	FROM Sales.SalesOrderHeaderBig
	WHERE SalesPersonID IN (288,289,290)
	  AND [Status] IN (3,4,5)
	  AND ModifiedDate >= '12/1/2013'
	GROUP BY [Status],SalesPersonID, SUBSTRING(CONVERT(VARCHAR(25),ModifiedDate,112),1,6)
	ORDER BY 1,2,3
GO

SELECT SUBSTRING(CONVERT(VARCHAR(25),ModifiedDate,112),1,6),[Status],SalesPersonID
		, count(*)
	FROM Sales.SalesOrderHeaderBig
	WHERE SalesPersonID IN (288,289,290)
	  AND [Status] IN (3,4,5)
	  AND ModifiedDate >= '12/1/2013'
	GROUP BY SUBSTRING(CONVERT(VARCHAR(25),ModifiedDate,112),1,6),[Status],SalesPersonID
	ORDER BY 1,2,3
GO