Find set of duplicates within columns in same row and across rows

johnnyg123
johnnyg123 used Ask the Experts™
on
Here is some sample data for a sql server 2014 table


CREATE TABLE #License(
       [customer_id] [int] NULL,
       [license_key1] [nchar](10) NULL,
       [license_state1] [nchar](10) NULL,
       [license_key2] [nchar](10) NULL,
       [license_state2] [nchar](10) NULL,
       [license_key3] [nchar](10) NULL,
       [license_state3] [nchar](10) NULL
 ) ON [PRIMARY]

 INSERT INTO #License
            ([customer_id]
            ,[license_key1]
            ,[license_state1]
            ,[license_key2]
            ,[license_state2]
            ,[license_key3]
            ,[license_state3])
      VALUES
            (11111111
            ,12345    
            ,'IL'
            ,34567    
            ,'IN'
          ,12345    
            ,'IL')

 INSERT INTO #License
            ([customer_id]
            ,[license_key1]
            ,[license_state1]
            ,[license_key2]
            ,[license_state2]
            ,[license_key3]
            ,[license_state3])
      VALUES
            (22222222
            ,66666    
            ,'MI'
            ,999999    
            ,'NY'
          ,87654    
            ,'CO')

 INSERT INTO #License
            ([customer_id]
            ,[license_key1]
            ,[license_state1]
            ,[license_key2]
            ,[license_state2]
            ,[license_key3]
            ,[license_state3])
      VALUES
            (33333333
            ,908763    
            ,'MI'
            ,22263    
            ,'NY'
          ,135798    
            ,'IL')

                   INSERT INTO #License
            ([customer_id]
            ,[license_key1]
            ,[license_state1]
            ,[license_key2]
            ,[license_state2]
            ,[license_key3]
            ,[license_state3])
      VALUES
            (44444444
            ,98765    
            ,'MI'
            ,4576  
            ,'NY'
          ,135798    
            ,'IL')

As you can see there are 3 sets of license key and state combinations

I want one query that will return any rows that have  a duplicate set in the 3 sets across row along with the combination that is the duplicate


Another query to identify a match in set values in any of the sets in any rows along with the combination that is the duplicate


(Note: it is possible that the license key could be alpha or alpha numeric)

Given above data

The query looking at combinations across columns in a row should return

customer_id       licensekey   licensestate
11111111            12345          IL


The query looking at combinations across rows should return

customer_id       licensekey   licensestate
33333333           135798          IL
44444444           135798          IL
Comment
Watch Question

Do more with

Expert Office
EXPERT OFFICE® is a registered trademark of EXPERTS EXCHANGE®
Hi Johnny,

I have 2 queries below. It would be good to test them over more data than just the 4 rows of license data provided.
Have a go and see if it is close.
Thanks
Tony


CREATE TABLE #License(
       [customer_id] [int] NULL,
       [license_key1] [nchar](10) NULL,
       [license_state1] [nchar](10) NULL,
       [license_key2] [nchar](10) NULL,
       [license_state2] [nchar](10) NULL,
       [license_key3] [nchar](10) NULL,
       [license_state3] [nchar](10) NULL
 ) ON [PRIMARY]


 INSERT INTO #License
            ([customer_id]
            ,[license_key1]
            ,[license_state1]
            ,[license_key2]
            ,[license_state2]
            ,[license_key3]
            ,[license_state3])
      VALUES
            (11111111
            ,12345     
            ,'IL'
            ,34567     
            ,'IN'
          ,12345     
            ,'IL')

 INSERT INTO #License
            ([customer_id]
            ,[license_key1]
            ,[license_state1]
            ,[license_key2]
            ,[license_state2]
            ,[license_key3]
            ,[license_state3])
      VALUES
            (22222222
            ,66666     
            ,'MI'
            ,999999     
            ,'NY'
          ,87654     
            ,'CO')

 INSERT INTO #License
            ([customer_id]
            ,[license_key1]
            ,[license_state1]
            ,[license_key2]
            ,[license_state2]
            ,[license_key3]
            ,[license_state3])
      VALUES
            (33333333
            ,908763     
            ,'MI'
            ,22263     
            ,'NY'
          ,135798    
            ,'IL')

                   INSERT INTO #License
            ([customer_id]
            ,[license_key1]
            ,[license_state1]
            ,[license_key2]
            ,[license_state2]
            ,[license_key3]
            ,[license_state3])
      VALUES
            (44444444
            ,98765     
            ,'MI'
            ,4576   
            ,'NY'
          ,135798    
            ,'IL')
            
            
SELECT *
FROM #License

--QUERY1
SELECT customer_id, license_key1 as license_key, license_state1 as license_state
FROM #License
WHERE (license_key1 = license_key2 or license_key1 = license_key3)
AND (license_state1 = license_state2 or license_state1 = license_state3)

--QUERY2
SELECT Customer_ID, LicenseKey, LicenseState
FROM 
(SELECT license_key1 as LicenseKey, license_state1 AS LicenseState, Count(license_key1) as Num1,  Count(license_state1) as State1
FROM #License
GROUP BY license_key1, license_state1
HAVING Count(license_key1) >1 OR Count(license_state1) >1
UNION
SELECT license_key2  as LicenseKey, license_state2 AS LicenseState, Count(license_key2) as Num2,  Count(license_state2) as State2
FROM #License
GROUP BY license_key2, license_state2
HAVING Count(license_key2) >1 OR Count(license_state2) >1
UNION
SELECT license_key3  as LicenseKey, license_state3 AS LicenseState, Count(license_key3) as Num3,  Count(license_state3) as State3
FROM #License
GROUP BY license_key3, license_state3
HAVING Count(license_key3) >1 OR Count(license_state3) >1) a
JOIN #License li
ON (a.LicenseKey = li.license_key1
OR a.LicenseKey = li.license_key2
OR a.LicenseKey = li.license_key3)

Open in new window

Author

Commented:
Hi Tony

Thanks so much for your response!

I really so not want to sound unappreciative because I am

Query 1 works just fine

However

Query 2 assumes the duplicates are in the same license_key and license_state group which is not the case
For example,

customer_id   license_key1  license_state1  license_key2  license_state2  license_key3  license_state3
111111111     12345              IL                        67897               NY                      77777              IL
222222222     12345              IL                        6789                 NY                      777777            IL

The duplicate would be found across rows because  for both customer ids the dups are in license_key1 and license_state1

However
The query would not find the dup given the following data

customer_id   license_key1  license_state1  license_key2  license_state2  license_key3  license_state3
111111111     12345              IL                        67897               NY                      77777              IL
222222222      44444              IL                       12345               IL                        88888              IL

The duplicate set of data is in license_key1 and license_state1 for customer id 111111111

The duplicate set of data is in license_key2 and license_state2 for customer id 222222222

Also,  I only included 3 groups in the sample data to try and make it a little cleaner.  However, there are actually 5 groups
Trying to account for all the combinations in the first where clause is a bit confusing
Senior DBA
Most Valuable Expert 2018
Top Expert 2014
Commented:
This will handle up to 3 dup cust id / lic key / lic state combinations.  If there can be more than three, the code would have a lot more overhead.  The min and max customer_id would still be correct, but there would be no third value shown.

Btw, you can get rid of all the complexity of this query by normalizing the structure so that there is only one license_key, license_state combination per row.
SELECT ca1.customer_id, license_key, license_state, customer_count AS duplicates_count
FROM (
    SELECT licenses.license_key, licenses.license_state,
        COUNT(DISTINCT licenses.customer_id) AS customer_count,
        MIN(licenses.customer_id) AS customer_id_min,
        CASE WHEN COUNT(DISTINCT licenses.customer_id) = 3 THEN 
            SUM(licenses.customer_id) - MIN(licenses.customer_id) - MAX(licenses.customer_id) 
            ELSE NULL END AS customer_id_middle,
        MAX(licenses.customer_id) AS customer_id_max
    FROM #License
    CROSS APPLY (
        VALUES(customer_id, license_key1, license_state1),
              (customer_id, license_key2, license_state2),
              (customer_id, license_key3, license_state3) 
    ) AS licenses(customer_id, license_key, license_state)
    WHERE licenses.license_key > '' AND licenses.license_state > ''
    GROUP BY license_key, license_state
    HAVING COUNT(*) > 1 AND COUNT(DISTINCT licenses.customer_id) > 1
) AS derived
CROSS APPLY (
    SELECT customer_id_min AS customer_id
    UNION ALL
    SELECT customer_id_middle
    WHERE customer_id_middle IS NOT NULL
    UNION ALL
    SELECT customer_id_max
) AS ca1
ORDER BY license_state, license_key, customer_id

Open in new window

Author

Commented:
Thanks for all the responses!

Tony,

Thanks so much for taking the time for your response

Hi Scott

Couldn't agree more about normalizing the vendor file

We are trying to help them figure out some issues so for now kinda stuck

Thanks so much for the solution!

Author

Commented:
Hi Scott

Couldn't agree more about normalizing the vendor file

We are trying to help them figure out some issues so for now kinda stuck

Thanks so much for the solution!

Do more with

Expert Office
Submit tech questions to Ask the Experts™ at any time to receive solutions, advice, and new ideas from leading industry professionals.

Start 7-Day Free Trial