• See what you think of this...

    -- Add the two new columns to dimMember...

    --ALTER TABLE dbo.dimMember ADD GroupID INT;

    --ALTER TABLE dbo.dimMember ADD MatchQuality INT;

    IF OBJECT_ID('tempdb..#MatchScores') IS NOT NULL

    DROP TABLE #MatchScores;

    CREATE TABLE #MatchScores (

    ParentID INT,

    ChildID INT,

    MemberCodeScore INT,

    FirstNameScore INT,

    LastNameScore INT,

    DOB_Score INT,

    SSN_Score INT,

    SubscriberCodeScore INT,

    Add1_Score INT,

    Add2_Score INT,

    TotalScore INT

    );

    IF OBJECT_ID('tempdb..#BestMatches') IS NOT NULL

    DROP TABLE #BestMatches;

    CREATE TABLE #BestMatches (

    ParentID INT,

    ChildID INT,

    BestScore INT,

    Lft INT,

    Rgt INT

    );

    -- Take a look at the base data.

    --SELECT * FROM dbo.dimMember dm;

    WITH ColumnScores AS (-- Assign weights to the various attributes

    SELECT

    dm1.dimMemberId AS ParentID,

    dm2.dimMemberId AS ChildID,

    CASE WHEN dm1.MemberCode = dm2.MemberCode THEN 3 ELSE 0 END AS MemberCodeScore,

    CASE WHEN dm1.FirstName = dm2.FirstName THEN 1 ELSE 0 END AS FirstNameScore,

    CASE WHEN dm1.LastName = dm2.LastName THEN 2 ELSE 0 END AS LastNameScore,

    CASE WHEN dm1.dimDayidDOB = dm2.dimDayidDOB THEN 1 ELSE 0 END AS DOB_Score,

    CASE WHEN NULLIF(dm1.SSN, 'n/a') = NULLIF(dm2.SSN, 'n/a') THEN 6 ELSE 0 END AS SSN_Score, -- Using NULLIF to prevent "n/a" from causing valid matches.

    CASE WHEN dm1.SubscriberCode = dm2.SubscriberCode THEN 1 ELSE 0 END AS SubscriberCodeScore,

    CASE WHEN NULLIF(dm1.CurrentAddress1, 'n/a') = NULLIF(dm2.CurrentAddress1, 'n/a') THEN 2 ELSE 0 END AS Add1_Score,

    CASE WHEN NULLIF(dm1.CurrentAddress2, 'n/a') = NULLIF(dm2.CurrentAddress2, 'n/a') THEN 1 ELSE 0 END AS Add2_Score

    FROM

    dbo.dimMember dm1

    JOIN dbo.dimMember dm2

    ON dm1.dimMemberId < dm2.dimMemberId

    )

    INSERT #MatchScores (ParentID,ChildID,MemberCodeScore,FirstNameScore,LastNameScore,SSN_Score,SubscriberCodeScore,

    Add1_Score,Add2_Score,TotalScore)

    SELECT

    cs.ParentID,

    cs.ChildID,

    cs.MemberCodeScore,

    cs.FirstNameScore,

    cs.LastNameScore,

    cs.SSN_Score,

    cs.SubscriberCodeScore,

    cs.Add1_Score,

    cs.Add2_Score,

    cs.MemberCodeScore + cs.FirstNameScore + cs.LastNameScore + cs.DOB_Score + cs.SSN_Score + SubscriberCodeScore + cs.Add1_Score +

    cs.Add2_Score AS TotalScore

    FROM

    ColumnScores cs

    WHERE

    MemberCodeScore + cs.FirstNameScore + cs.LastNameScore + cs.DOB_Score + cs.SSN_Score + SubscriberCodeScore + cs.Add1_Score +

    cs.Add2_Score > 4 -- Set a threashold for total match quality

    ORDER BY

    cs.ParentID;

    --===============================================================

    INSERT #BestMatches (ParentID,ChildID,BestScore)

    SELECT

    bs.ParentID,

    bs.ChildID,

    bs.TotalScore AS BestScore

    FROM

    (SELECT DISTINCT ms1.ChildID

    FROM #MatchScores ms1) c

    CROSS APPLY (

    SELECT TOP 1

    *

    FROM #MatchScores ms2

    WHERE c.ChildID = ms2.ChildID

    ORDER BY ms2.TotalScore DESC

    ) bs;

    INSERT #BestMatches (ParentID,ChildID,BestScore)

    SELECT ParentID,ChildID,BestScore FROM (VALUES (NULL, 0, 100)) x (ParentID,ChildID,BestScore)

    UNION ALL

    SELECT DISTINCT

    0 AS ParentID,

    dm.dimMemberId AS ChildID,

    100 AS BestScore

    FROM dbo.dimMember dm-- Change from the original script... Pulling the "Parent-less" from dimMember instead of #BestMatches.

    WHERE NOT EXISTS (SELECT 1 FROM #BestMatches bm WHERE dm.dimMemberId = bm.ChildID);

    --==========================================================================

    -- Use nested sets to create the groups

    EXEC dbo.CreateNestedSets-- This proc is simply a dynamic version of Jeff Modens "Hierarchies on Steroids #1: Convert an Adjacency List to Nested Sets" script.

    @TableName = '#BestMatches',

    @ChildColName = 'ChildID',

    @ParentColName = 'ParentID',

    @LeftColName = 'Lft',

    @RightColName = 'Rgt',

    @DeBug = 0;

    DELETE #BestMatches WHERE ParentID IS NULL;

    --==========================================================================

    UPDATE dm SET

    dm.GroupID = bm.GroupID,

    dm.MatchQuality = bm.BestScore

    FROM

    dbo.dimMember dm

    JOIN (

    SELECT

    Groups.ChildID AS GroupID,

    bm2.ChildID,

    bm2.BestScore

    FROM (

    SELECT

    bm1.ChildID,

    bm1.Lft,

    bm1.Rgt

    FROM

    #BestMatches bm1

    WHERE

    bm1.ParentID = 0

    ) Groups

    JOIN #BestMatches bm2

    ON bm2.Lft BETWEEN Groups.Lft AND Groups.Rgt

    ) bm

    ON dm.dimMemberId = bm.ChildID;

    SELECT * FROM dbo.dimMember dm

    See the attached spreadsheet for the results...