Link to home
Start Free TrialLog in
Avatar of richa1960
richa1960Flag for United States of America

asked on

Remove large number of records from a sql database table 5k to 10k at a time.

I have to remove over 800K records from a sql database, but would like to remove the records with a batch process of 5K to 10K at a time.

I've gathered my count using the query one below, and came up with the delete process in query two.

QUERY ONE:
WITH CTE AS (
    Select
        D_Item_GroupSK
    FROM
        DWH.F_Line_item_Shipment
    UNION ALL
       Select
        D_Item_GroupSK
    FROM
        DWH.F_Line_item_MBL
)
SELECt
    count(D.D_Item_GroupSK)
FROM
    DWH.D_Item_Group D LEFT OUTER JOIN
    CTE C ON D.D_Item_GroupSK = C.D_Item_GroupSK
WHERE
    C.D_Item_GroupSK IS NULL


Query two:
use [Datamart]
GO
DECLARE @batch INT
SET @batch = 10000
DECLARE @cnt INT
SET @cnt = 0
DECLARE @cntmax INT
SET @cntmax = 10
WHILE @cnt < @cntmax
BEGIN
  SET @cnt = @cnt + 1
  DELETE TOP(@batch) FROM DWH.F_Line_item_Shipment WHERE C.D_Item_GroupSK IS NULL
END
Avatar of David Todd
David Todd
Flag of New Zealand image

Hi,

The where clause in your delete is wrong. Its wrong on two counts - you're specifying the value as a character when the variable is an integer; in the line above, on the first iteration you are setting the value of @cnt to 1, so the delete wont delete anything.

Here is some code I've been working on - generates a table and deletes from it. The deletion is random within the range. Even when I added a clustered index to the table I still got random deletions, as couldn't specify an order by clause in the delete. But if order by is specified in the selection of id's to delete, you'll see that the third delete takes out ids 1500 through 1599 inclusive; even though the range is 1500 - 1699

HTH
  David

/*
use [Datamart] 
GO
DECLARE @batch INT
SET @batch = 10000 
DECLARE @cnt INT
SET @cnt = 0 
DECLARE @cntmax INT
SET @cntmax = 10 
WHILE @cnt < @cntmax
BEGIN
  SET @cnt = @cnt + 1
  DELETE TOP(@batch) FROM DWH.F_Line_item_Shipment WHERE @cnt = '0'
END 

*/

-- foxtrot\SQL2017
use EE
go

if exists(
	select *
	from sys.objects
	where
		object_id = object_id( N'dbo.F_Line_item_Shipment' )
		and type in( N'U' )
	)
	drop table dbo.F_Line_item_Shipment
;
go

if not exists(
	select *
	from sys.objects
	where
		object_id = object_id( N'dbo.F_Line_item_Shipment' )
		and type in( N'U' )
	)
	create table dbo.F_Line_item_Shipment (
		F_Line_item_ShipmentID int not null
	) on [PRIMARY]
go

if object_id( N'tempdb..#Digit', N'U' ) is not null 
	drop table #Digit;
	
create table #Digit(
	d int
	)
	
insert #Digit( d ) values( 0 ), ( 1 ), ( 2 ), ( 3 ), ( 4 ), ( 5 ), ( 6 ), ( 7 ), ( 8 ), ( 9 )
;

--select *
--from #Digit
--go

if object_id( N'tempdb..#Number', N'U' ) is not null 
	drop table #Number;
	
create table #Number(
	d int
	)

insert #Number( d )	
	select ((( tt.d * 10 +  th.d ) * 10 + h.d ) * 10 + t.d ) * 10 + o.d 
	from #Digit o
	cross join #Digit t
	cross join #Digit h
	cross join #Digit th
	cross join #Digit tt
;

insert dbo.F_Line_item_Shipment( F_Line_item_ShipmentID )
	select d
	from #Number 
;

select count( * )
from dbo.F_Line_item_Shipment
;

-- delete
DECLARE @batch INT
SET @batch = 10 
DECLARE @cnt INT
SET @cnt = 0 
DECLARE @cntmax INT
SET @cntmax = 10 
WHILE @cnt < @cntmax
BEGIN
  SET @cnt = @cnt + 1
  DELETE TOP(@batch) FROM dbo.F_Line_item_Shipment 
  where
	F_Line_item_ShipmentID in (
		select d
		from #Number
		where
			d >= 500
			and d < 700
	)
END 
-- end delete

select count( * )
from dbo.F_Line_item_Shipment
;

select *
from dbo.F_Line_item_Shipment
where
	F_Line_item_ShipmentID < 2000
order by
	F_Line_item_ShipmentID
;

alter table dbo.F_Line_item_Shipment
add constraint F_Line_item_Shipment_pk Primary Key Clustered ( F_Line_item_ShipmentID )
;

-- redo delete for another range
SET @cnt = 0 
WHILE @cnt < @cntmax
BEGIN
  SET @cnt = @cnt + 1
  DELETE TOP(@batch) FROM dbo.F_Line_item_Shipment 
  where
	F_Line_item_ShipmentID in (
		select d
		from #Number
		where
			d >= 1100
			and d < 1300
	)
END 

select count( * )
from dbo.F_Line_item_Shipment
;

select *
from dbo.F_Line_item_Shipment
where
	F_Line_item_ShipmentID < 2000
;

-- redo delete for another range
SET @cnt = 0 
WHILE @cnt < @cntmax
BEGIN
  SET @cnt = @cnt + 1
  DELETE FROM dbo.F_Line_item_Shipment 
  where
	F_Line_item_ShipmentID in (
		select top (@batch) s.F_Line_item_ShipmentID
		from dbo.F_Line_item_Shipment s

		where
			s.F_Line_item_ShipmentID >= 1500
			and s.F_Line_item_ShipmentID < 1700
		order by
			s.F_Line_item_ShipmentID asc
	)
END 

select count( * )
from dbo.F_Line_item_Shipment
;

select *
from dbo.F_Line_item_Shipment
where
	F_Line_item_ShipmentID < 2000
;

Open in new window

Avatar of richa1960

ASKER

On the select of a sample of a 1000 records, they're random based on a null entry
for the record having a parent entry in the fact table.

I took my original code and changed some things. I like what you sent I'd like to have two
options when its time to run this.

here are my changes. It parses ok but I think i'm missing something.

 use [Datamart] /* PUT YOUR DATABASE NAME HERE */
GO

DECLARE @batch INT
SET @batch = 10000 /* SIZE OF BATCH */
DECLARE @cnt INT
SET @cnt = 0 /* START OF COUNTER */
DECLARE @cntmax INT
SET @cntmax = 10; /* NUMBER OF BATCHES */

WITH CTE AS (
    Select
        D_Item_GroupSK
    FROM
        DWH.F_Line_item_Shipment
    UNION ALL
       Select
        D_Item_GroupSK
    FROM
        DWH.F_Line_item_MBL
)
SELECt
    top 1000 (D.D_Item_GroupSK)
FROM
    DWH.D_Item_Group D LEFT OUTER JOIN
    CTE C ON D.D_Item_GroupSK = C.D_Item_GroupSK
WHERE
    C.D_Item_GroupSK IS NULL

WHILE @cnt < @cntmax
BEGIN
  SET @cnt = @cnt + 1
  DELETE @batch
  FROM DWH.D_Item_Group D LEFT OUTER JOIN
    CTE C ON D.D_Item_GroupSK = C.D_Item_GroupSK WHERE C.D_Item_GroupSK IS NULL /* ACTUAL DELETE QUERY - ADJUST */
END
ASKER CERTIFIED SOLUTION
Avatar of David Todd
David Todd
Flag of New Zealand image

Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial
No, I'm going to see if i can grab a sub-set of the data and test it out
I've copy of set of the data from production to my test system. Having 2 issues, with the script you provided I still get the same record count when i run it to remove groups of the data based on the count i get from my cte script. The second issues is I'm getting an error with my other delete statement where its giving me an incorrect syntax near 'D' and i cant figure out why.

 use [Datamart] /* PUT YOUR DATABASE NAME HERE */
GO
DECLARE @batch INT
SET @batch = 20000 /* SIZE OF BATCH */
DECLARE @cnt INT
SET @cnt = 0 /* START OF COUNTER */
DECLARE @cntmax INT
SET @cntmax = 10 /* NUMBER OF BATCHES */
WHILE @cnt < @cntmax
BEGIN
  SET @cnt = @cnt + 1

   DELETE TOP(@batch) FROM  
     DWH.D_Item_Group D LEFT OUTER JOIN
     C.D_Item_GroupSK ON D_Item_GroupSK = C.D_Item_GroupSK
WHERE
    C.D_Item_GroupSK IS NULL /* ACTUAL DELETE QUERY - ADJUST */
END
Hi,

The left outer join is missing the table name. C.D_Item_GroupSK is the tablealias.columnName

HTH
  David
Was able to get
use Datamart
go

--start count

select count( * )
from DWH.F_Line_item_Shipment
;

-- delete
DECLARE @batch INT
SET @batch =1000 -- works in batchs set here
DECLARE @cnt INT
SET @cnt = 0 -- count setting
DECLARE @cntmax INT
SET @cntmax = 60 -- count max while running
WHILE @cnt < @cntmax
BEGIN
  SET @cnt = @cnt + 1
  DELETE TOP(@batch) FROM DWH.F_Line_item_Shipment
  where
      F_Line_Item_ShipmentSK in (select F_Line_Item_ShipmentSK from DWH.F_Line_Item_Shipment
            where F_Line_Item_ShipmentSK >= 1 and F_Line_Item_ShipmentSK < 5000 -- sets group to be removed based on id number
      )
END
-- end delete
-- end count
select count( * )
from DWH.F_Line_item_Shipment
;

-- confirm count range
select *
from DWH.F_Line_item_Shipment
where
      F_Line_Item_ShipmentSK < 5000
order by
      F_Line_Item_ShipmentSK
;

--alter table dbo.F_Line_item_ShipmentSK
--add constraint F_Line_item_Shipment_pk Primary Key Clustered ( F_Line_item_ShipmentSK )
;

-------- DOUBLE CHECK ------

SELECT F_Line_Item_ShipmentSK
  FROM DWH.F_Line_Item_Shipment
  where F_Line_Item_ShipmentSK between '1' and '5000'
the following to work.