-- 1st a straight row count
SELECT count(*)
FROM EE_Table
-- 2nd a select using our definition for a duplicate showing just the duplicates
SELECT Col1,Col2,col3,col4,colN
FROM EE_Table
GROUP BY col1,Col2,col3,col4,colN
HAVING count(*) > 1
-- 3rd which means the rows that we really want to keep are (or use the group by above)
SELECT DISTINCT col1,col2,col3,col4,colN
FROM EE_Table
-- Take a copy using "select into" the table you do select into must not already exist
SELECT * INTO EE_Table_Back FROM EE_Table
-- now empty out the original table - now truncate might fail if there are constraints, but our sample is simple and this works
TRUNCATE TABLE EE_Table
-- then Insert records back with duplicate records filtered out.
INSERT EE_Table (col1,col2,col3,col4, colN)
SELECT DISTINCT col1,col2,col3,col4,colN FROM EE_Table_back
-- 1st a straight row count
SELECT count(*)
FROM EE_Table
-- 2nd a select using our definition for a duplicate showing just the duplicates
SELECT Col1,col3,col4,colN
FROM EE_Table
GROUP BY col1,col3,col4,colN
HAVING count(*) > 1
-- 3rd query. SQL 2005 and up, we can use the row_number() function and a CTE query
;WITH Dupes AS
(Select row_number() over (partition by col1,col3,col4,colN order by col2) as rn, * from EE_Table)
SELECT * FROM dupes WHERE rn = 1
;WITH Dupes as
(Select row_number() over (partition by col1,col3,col4,colN order by col2) as rn, * from EE_Table_Back)
INSERT EE_Table (Col1, Col2, Col3, Col4, ColN)
SELECT Col1, Col2, Col3, Col4, ColN FROM dupes WHERE rn = 1
-- or, since we have a back up any way, we can use the CTE to directly edit the table
;WITH Dupes AS
(Select row_number() over (partition by col1,col3,col4,colN order by col2) as rn, * from EE_Table)
DELETE FROM dupes WHERE rn > 1
-- if you replace the "delete" with "select * " you get a list of the duplicates as mentioned above.
-- add a new identity column to our table - might need to be BIGINT datatype for very large tables, and always take a backup before hand
ALTER TABLE EE_Table ADD dupe_id int identity
-- now we can select just the duplicates
SELECT * FROM EE_Table
WHERE dupe_id <> (select min(dupe_id)
from ee_table d
where d.col1 = EE_table.col1
and d.col3 = EE_Table.col3
and d.col4 = EE_Table.col4
and d.colN = EE_Table.colN)
-- once we are happy with the select, all we need to do is change into a delete
DELETE EE_Table
WHERE dupe_id <> (select min(dupe_id)
from ee_table d
where d.col1 = EE_table.col1
and d.col3 = EE_Table.col3
and d.col4 = EE_Table.col4
and d.colN = EE_Table.colN)
-- and now remove that column
Alter table EE_Table drop column dupe_id
Have a question about something in this article? You can receive help directly from the article author. Sign up for a free trial to get started.
Comments (2)
Commented:
Deleting duplicates can be hard work, and I think you have covered off the more popular approaches pretty well.
I also like the importance of taking a backup first :)
Commented:
Thanks,
Rod