• Paul White

    SSC Guru

    Points: 150341

    Super-fast DISTINCT using a recursive CTE:

    USE tempdb;

    GO

    DROP TABLE dbo.Test;

    GO

    CREATE TABLE

    dbo.Test

    (

    data INTEGER NOT NULL,

    );

    GO

    CREATE CLUSTERED INDEX c ON dbo.Test (data);

    GO

    -- Lots of duplicated values

    INSERT dbo.Test WITH (TABLOCK)

    (data)

    SELECT TOP (5000000)

    ROW_NUMBER() OVER (ORDER BY (SELECT 0)) / 117329

    FROM master.sys.columns C1,

    master.sys.columns C2,

    master.sys.columns C3;

    GO

    SET STATISTICS TIME ON;

    -- 1591ms CPU

    SELECT DISTINCT

    data

    FROM dbo.Test;

    -- 15ms CPU

    WITH RecursiveCTE

    AS (

    SELECT data = MIN(T.data)

    FROM dbo.Test T

    UNION ALL

    SELECT R.data

    FROM (

    -- A cunning way to use TOP in the recursive part of a CTE 🙂

    SELECT T.data,

    rn = ROW_NUMBER() OVER (ORDER BY T.data)

    FROM dbo.Test T

    JOIN RecursiveCTE R

    ON R.data < T.data

    ) R

    WHERE R.rn = 1

    )

    SELECT *

    FROM RecursiveCTE

    OPTION (MAXRECURSION 0);

    SET STATISTICS TIME OFF;

    GO

    DROP TABLE dbo.Test;

    The recursive CTE is 100 times more efficient 🙂