See what you think of this...
-- Add the two new columns to dimMember...
--ALTER TABLE dbo.dimMember ADD GroupID INT;
--ALTER TABLE dbo.dimMember ADD MatchQuality INT;
IF OBJECT_ID('tempdb..#MatchScores') IS NOT NULL
DROP TABLE #MatchScores;
CREATE TABLE #MatchScores (
ParentID INT,
ChildID INT,
MemberCodeScore INT,
FirstNameScore INT,
LastNameScore INT,
DOB_Score INT,
SSN_Score INT,
SubscriberCodeScore INT,
Add1_Score INT,
Add2_Score INT,
TotalScore INT
);
IF OBJECT_ID('tempdb..#BestMatches') IS NOT NULL
DROP TABLE #BestMatches;
CREATE TABLE #BestMatches (
ParentID INT,
ChildID INT,
BestScore INT,
Lft INT,
Rgt INT
);
-- Take a look at the base data.
--SELECT * FROM dbo.dimMember dm;
WITH ColumnScores AS (-- Assign weights to the various attributes
SELECT
dm1.dimMemberId AS ParentID,
dm2.dimMemberId AS ChildID,
CASE WHEN dm1.MemberCode = dm2.MemberCode THEN 3 ELSE 0 END AS MemberCodeScore,
CASE WHEN dm1.FirstName = dm2.FirstName THEN 1 ELSE 0 END AS FirstNameScore,
CASE WHEN dm1.LastName = dm2.LastName THEN 2 ELSE 0 END AS LastNameScore,
CASE WHEN dm1.dimDayidDOB = dm2.dimDayidDOB THEN 1 ELSE 0 END AS DOB_Score,
CASE WHEN NULLIF(dm1.SSN, 'n/a') = NULLIF(dm2.SSN, 'n/a') THEN 6 ELSE 0 END AS SSN_Score, -- Using NULLIF to prevent "n/a" from causing valid matches.
CASE WHEN dm1.SubscriberCode = dm2.SubscriberCode THEN 1 ELSE 0 END AS SubscriberCodeScore,
CASE WHEN NULLIF(dm1.CurrentAddress1, 'n/a') = NULLIF(dm2.CurrentAddress1, 'n/a') THEN 2 ELSE 0 END AS Add1_Score,
CASE WHEN NULLIF(dm1.CurrentAddress2, 'n/a') = NULLIF(dm2.CurrentAddress2, 'n/a') THEN 1 ELSE 0 END AS Add2_Score
FROM
dbo.dimMember dm1
JOIN dbo.dimMember dm2
ON dm1.dimMemberId < dm2.dimMemberId
)
INSERT #MatchScores (ParentID,ChildID,MemberCodeScore,FirstNameScore,LastNameScore,SSN_Score,SubscriberCodeScore,
Add1_Score,Add2_Score,TotalScore)
SELECT
cs.ParentID,
cs.ChildID,
cs.MemberCodeScore,
cs.FirstNameScore,
cs.LastNameScore,
cs.SSN_Score,
cs.SubscriberCodeScore,
cs.Add1_Score,
cs.Add2_Score,
cs.MemberCodeScore + cs.FirstNameScore + cs.LastNameScore + cs.DOB_Score + cs.SSN_Score + SubscriberCodeScore + cs.Add1_Score +
cs.Add2_Score AS TotalScore
FROM
ColumnScores cs
WHERE
MemberCodeScore + cs.FirstNameScore + cs.LastNameScore + cs.DOB_Score + cs.SSN_Score + SubscriberCodeScore + cs.Add1_Score +
cs.Add2_Score > 4 -- Set a threashold for total match quality
ORDER BY
cs.ParentID;
--===============================================================
INSERT #BestMatches (ParentID,ChildID,BestScore)
SELECT
bs.ParentID,
bs.ChildID,
bs.TotalScore AS BestScore
FROM
(SELECT DISTINCT ms1.ChildID
FROM #MatchScores ms1) c
CROSS APPLY (
SELECT TOP 1
*
FROM #MatchScores ms2
WHERE c.ChildID = ms2.ChildID
ORDER BY ms2.TotalScore DESC
) bs;
INSERT #BestMatches (ParentID,ChildID,BestScore)
SELECT ParentID,ChildID,BestScore FROM (VALUES (NULL, 0, 100)) x (ParentID,ChildID,BestScore)
UNION ALL
SELECT DISTINCT
0 AS ParentID,
dm.dimMemberId AS ChildID,
100 AS BestScore
FROM dbo.dimMember dm-- Change from the original script... Pulling the "Parent-less" from dimMember instead of #BestMatches.
WHERE NOT EXISTS (SELECT 1 FROM #BestMatches bm WHERE dm.dimMemberId = bm.ChildID);
--==========================================================================
-- Use nested sets to create the groups
EXEC dbo.CreateNestedSets-- This proc is simply a dynamic version of Jeff Modens "Hierarchies on Steroids #1: Convert an Adjacency List to Nested Sets" script.
@TableName = '#BestMatches',
@ChildColName = 'ChildID',
@ParentColName = 'ParentID',
@LeftColName = 'Lft',
@RightColName = 'Rgt',
@DeBug = 0;
DELETE #BestMatches WHERE ParentID IS NULL;
--==========================================================================
UPDATE dm SET
dm.GroupID = bm.GroupID,
dm.MatchQuality = bm.BestScore
FROM
dbo.dimMember dm
JOIN (
SELECT
Groups.ChildID AS GroupID,
bm2.ChildID,
bm2.BestScore
FROM (
SELECT
bm1.ChildID,
bm1.Lft,
bm1.Rgt
FROM
#BestMatches bm1
WHERE
bm1.ParentID = 0
) Groups
JOIN #BestMatches bm2
ON bm2.Lft BETWEEN Groups.Lft AND Groups.Rgt
) bm
ON dm.dimMemberId = bm.ChildID;
SELECT * FROM dbo.dimMember dm
See the attached spreadsheet for the results...