RE: How to eliminate duplicate records from a database table?

SSCommitted

Points: 1517

January 6, 2009 at 4:01 am

I'm not sure I'd use this script on a table with 10 million rows in it due to the potential for the cursor to become unwieldy if there are a large number of duplicates. I have eliminated duplicates using this kind of approach before:

(you can cut and paste this entire script into QA/MS and run it with no modifications)

/* -----------------------------------------------------------------

Script by Malcolm Leach 6th Jan 2009, http://www.dbghost.com

Set up test database for demo

----------------------------------------------------------------- */

use master

if exists (select 1 from master..sysdatabases where name = 'EliminateDuplicatesDemo')

drop database EliminateDuplicatesDemo

create database EliminateDuplicatesDemo

use EliminateDuplicatesDemo

create table DupTable (id int, col1 varchar(50))

insert into DupTable values(1,'some text')

insert into DupTable values(2,'some text2')

-- the ID is a duplicate on the following row but the value of col1 is not...

insert into DupTable values(2,'this is different but with the same id')

insert into DupTable values(3,'some text3')

insert into DupTable values(4,'some text4')

select * from duptable

begin tran

/* -----------------------------------------------------------------

Extract all rows with duplicate ID's to a temp table

----------------------------------------------------------------- */

-- get all the duplicated ID's first

select id into TempDupIds from DupTable group by id having count(id) > 1

select distinct dt.id, dt.col1

into TempDupTableDupRows

from DupTable dt

inner join TempDupIds di on di.id = dt.id

-- this query will show up any rows that have duplicate IDs but different data

-- these rows will need "special attention"

select id from TempDupTableDupRows group by id having count(id) > 1

/* -----------------------------------------------------------------

Delete all the rows with duplicate ID's from the main table

----------------------------------------------------------------- */

delete from DupTable where id in (select id from TempDupIds)

/* -----------------------------------------------------------------

Put the rows back into the main table

----------------------------------------------------------------- */

insert into DupTable select * from TempDupTableDupRows

commit tran

-- show the cleaned up table (the problem row is still there so no data has been lost)

select * from duptable

Malcolm
DB Ghost - Build, compare and synchronize from source control = Database Change Management for SQL Server
www.dbghost.com