Home Forums SQL Server 2008 T-SQL (SS2K8) De-duplicating Customer Detail and assigning parent child structure to multiple customer records RE: De-duplicating Customer Detail and assigning parent child structure to multiple customer records

  • Have a play with this, Matt. Hopefully it will give you some ides.

    IF OBJECT_ID( 'tempdb..#rawdata') IS NOT NULL DROP TABLE #rawdata

    IF OBJECT_ID( 'tempdb..#WorkingSet') IS NOT NULL DROP TABLE #WorkingSet

    CREATE TABLE #rawdata (dateid DATETIME, CustID INT, CCNumber CHAR(9), First_Name VARCHAR(10), Last_Name VARCHAR(10))

    INSERT INTO #rawdata (dateid, CustID, CCNumber, First_Name, Last_Name)

    SELECT '20090503', 7320, '370090510', 'Jane', 'Smith' union

    SELECT '20090420', 25861, '370090510', 'Jane', 'Smith' union

    SELECT '20100404', 7320, '370080987', 'Jane', 'Smith' union

    SELECT '20090703', 25861, '370080987', 'Jane', 'Smith' union

    SELECT '20100603', 7320, '594220191', 'Jane', 'Smith' union

    SELECT '20100705', 25861, '594220191', 'Jane', 'Smith' union

    SELECT '20100719', 7320, '594220654', 'Jane', 'Smith' union

    SELECT '20100720', 7321, '594220654', 'Jane', 'Smith'

    SELECT

    DupesetID = CAST(NULL AS BIGINT),

    r1.*,

    r2_CustID = r2.CustID

    INTO #WorkingSet

    FROM #rawdata r1

    INNER JOIN #rawdata r2 ON r2.CCNumber = r1.CCNumber AND r2.CustID <> r1.CustID

    ORDER BY r1.CustID

    -- assign dupesetID

    UPDATE d SET DupesetID = s.DupesetID

    FROM #WorkingSet d

    INNER JOIN (

    SELECT DupesetID = ROW_NUMBER() OVER (ORDER BY Last_Name, CCNumber),

    Last_Name, CCNumber

    FROM #WorkingSet

    GROUP BY Last_Name, CCNumber

    ) s ON s.Last_Name = d.Last_Name

    AND s.CCNumber = d.CCNumber

    -- merge sets with same customerID to earliest DupesetID

    UPDATE d SET DupesetID = s.DupesetID

    FROM #WorkingSet d

    INNER JOIN (

    SELECT CustID, DupesetID = MIN(DupesetID), OLDDupesetID = MAX(DupesetID)

    FROM #WorkingSet

    GROUP BY CustID

    HAVING COUNT(*) > 1 AND MIN(DupesetID) <> MAX(DupesetID)

    ) s ON s.OLDDupesetID = d.DupesetID

    -- merge again if necessary

    -- assign principal, and number of rows in dupeset

    SELECT MasterCustomer = ROW_NUMBER() OVER (PARTITION BY d.DupesetID ORDER BY dateid DESC),

    DupesetSize = COUNT(*) over (PARTITION BY d.DupesetID),

    d.DupesetID, d.dateid, d.CustID, d.CCNumber, d.First_Name, d.Last_Name, d.r2_CustID

    FROM #WorkingSet d

    ORDER BY DupesetID, MasterCustomer

    “Write the query the simplest way. If through testing it becomes clear that the performance is inadequate, consider alternative query forms.” - Gail Shaw

    For fast, accurate and documented assistance in answering your questions, please read this article.
    Understanding and using APPLY, (I) and (II) Paul White
    Hidden RBAR: Triangular Joins / The "Numbers" or "Tally" Table: What it is and how it replaces a loop Jeff Moden