Super-fast DISTINCT using a recursive CTE:
USE tempdb;
GO
DROP TABLE dbo.Test;
GO
CREATE TABLE
dbo.Test
(
data INTEGER NOT NULL,
);
GO
CREATE CLUSTERED INDEX c ON dbo.Test (data);
GO
-- Lots of duplicated values
INSERT dbo.Test WITH (TABLOCK)
(data)
SELECT TOP (5000000)
ROW_NUMBER() OVER (ORDER BY (SELECT 0)) / 117329
FROM master.sys.columns C1,
master.sys.columns C2,
master.sys.columns C3;
GO
SET STATISTICS TIME ON;
-- 1591ms CPU
SELECT DISTINCT
data
FROM dbo.Test;
-- 15ms CPU
WITH RecursiveCTE
AS (
SELECT data = MIN(T.data)
FROM dbo.Test T
UNION ALL
SELECT R.data
FROM (
-- A cunning way to use TOP in the recursive part of a CTE :)
SELECT T.data,
rn = ROW_NUMBER() OVER (ORDER BY T.data)
FROM dbo.Test T
JOIN RecursiveCTE R
ON R.data < T.data
) R
WHERE R.rn = 1
)
SELECT *
FROM RecursiveCTE
OPTION (MAXRECURSION 0);
SET STATISTICS TIME OFF;
GO
DROP TABLE dbo.Test;
The recursive CTE is 100 times more efficient 🙂