Delete all subset records
Today I helped a guy out with a special request. His goal was to remove all records that are subsets or parts of another record, a superset record.
See this sample data
DECLARE @Sample TABLE
(
recID INT IDENTITY(1, 1),
col1 VARCHAR(1),
col2 VARCHAR(2),
col3 VARCHAR(3),
userID INT
)
INSERT @Sample
SELECT 'A', 'B', 'C', 1 UNION ALL
SELECT 'A', 'B', ' ', 1 UNION ALL
SELECT 'A', ' ', 'C', 1 UNION ALL
SELECT 'F', ' ', 'C', 1 UNION ALL
SELECT ' ', 'M', ' ', 2 UNION ALL
SELECT 'T', 'M', 'O', 2 UNION ALL
SELECT ' ', 'M', 'O', 2 UNION ALL
SELECT 'X', 'M', 'O', 2 UNION ALL
SELECT ' ', ' ', 'Y', 3 UNION ALL
SELECT ' ', 'X', 'Y', 3
SELECT *
FROM @Sample
recID col1 col2 col3 userID
1 A B C 1
2 A B 1
3 A C 1
4 F C 1
5 M 2
6 T M O 2
7 M O 2
8 X M O 2
9 Y 3
10 X Y 3
Records 2 and 3 are subsets of Record 1, so they should be deleted.
Record 5 is a subset of Records 6, 7 and 8, so it should be deleted.
Record 7 is a subset of Records 6 and 8, so it should be deleted.
(
recID INT IDENTITY(1, 1),
col1 VARCHAR(1),
col2 VARCHAR(2),
col3 VARCHAR(3),
userID INT
)
INSERT @Sample
SELECT 'A', 'B', 'C', 1 UNION ALL
SELECT 'A', 'B', ' ', 1 UNION ALL
SELECT 'A', ' ', 'C', 1 UNION ALL
SELECT 'F', ' ', 'C', 1 UNION ALL
SELECT ' ', 'M', ' ', 2 UNION ALL
SELECT 'T', 'M', 'O', 2 UNION ALL
SELECT ' ', 'M', 'O', 2 UNION ALL
SELECT 'X', 'M', 'O', 2 UNION ALL
SELECT ' ', ' ', 'Y', 3 UNION ALL
SELECT ' ', 'X', 'Y', 3
SELECT *
FROM @Sample
recID col1 col2 col3 userID
1 A B C 1
2 A B 1
3 A C 1
4 F C 1
5 M 2
6 T M O 2
7 M O 2
8 X M O 2
9 Y 3
10 X Y 3
Records 2 and 3 are subsets of Record 1, so they should be deleted.
Record 5 is a subset of Records 6, 7 and 8, so it should be deleted.
Record 7 is a subset of Records 6 and 8, so it should be deleted.
Record 9 is a subset of Record 10, so it should be deleted.
Remember that records are to be kept with userID.
This is my suggestion how deal with all subset deletes.
DELETE s
FROM (
SELECT recID,
col1,
col2,
col3,
userID,
CASE WHEN col1 = '' THEN 0 ELSE 1 END
+ CASE WHEN col2 = '' THEN 0 ELSE 1 END
+ CASE WHEN col3 = '' THEN 0 ELSE 1 END AS theItems
FROM @Sample
) AS s
INNER JOIN (
SELECT MIN(recID) AS minID,
col1,
col2,
col3,
userID,
CASE WHEN col1 = '' THEN 0 ELSE 1 END
+ CASE WHEN col2 = '' THEN 0 ELSE 1 END
+ CASE WHEN col3 = '' THEN 0 ELSE 1 END AS theItems
FROM @Sample
GROUP BY col1,
col2,
col3,
userID,
CASE WHEN col1 = '' THEN 0 ELSE 1 END
+ CASE WHEN col2 = '' THEN 0 ELSE 1 END
+ CASE WHEN col3 = '' THEN 0 ELSE 1 END
) AS w ON w.userID = s.userID
AND w.minID <> s.recID
AND w.theItems >= s.theItems
WHERE w.col1 = COALESCE(NULLIF(s.col1, ''), w.col1)
AND w.col2 = COALESCE(NULLIF(s.col2, ''), w.col2)
AND w.col3 = COALESCE(NULLIF(s.col3, ''), w.col3)
SELECT *
FROM @Sample
Remember that records are to be kept with userID.
This is my suggestion how deal with all subset deletes.
DELETE s
FROM (
SELECT recID,
col1,
col2,
col3,
userID,
CASE WHEN col1 = '' THEN 0 ELSE 1 END
+ CASE WHEN col2 = '' THEN 0 ELSE 1 END
+ CASE WHEN col3 = '' THEN 0 ELSE 1 END AS theItems
FROM @Sample
) AS s
INNER JOIN (
SELECT MIN(recID) AS minID,
col1,
col2,
col3,
userID,
CASE WHEN col1 = '' THEN 0 ELSE 1 END
+ CASE WHEN col2 = '' THEN 0 ELSE 1 END
+ CASE WHEN col3 = '' THEN 0 ELSE 1 END AS theItems
FROM @Sample
GROUP BY col1,
col2,
col3,
userID,
CASE WHEN col1 = '' THEN 0 ELSE 1 END
+ CASE WHEN col2 = '' THEN 0 ELSE 1 END
+ CASE WHEN col3 = '' THEN 0 ELSE 1 END
) AS w ON w.userID = s.userID
AND w.minID <> s.recID
AND w.theItems >= s.theItems
WHERE w.col1 = COALESCE(NULLIF(s.col1, ''), w.col1)
AND w.col2 = COALESCE(NULLIF(s.col2, ''), w.col2)
AND w.col3 = COALESCE(NULLIF(s.col3, ''), w.col3)
SELECT *
FROM @Sample
Legacy Comments
Casual Observer
2008-10-15 |
re: Delete all subset records I guess I'm missing something... DELETE a FROM @Sample a WHERE EXISTS (SELECT * FROM @Sample b WHERE a.userID = b.userID AND a.recID <> b.recID AND (a.col1 = b.col1 OR a.col1 = '') AND (a.col2 = b.col2 OR a.col2 = '') AND (a.col3 = b.col3 OR a.col3 = '')) SELECT * FROM @Sample BTW - your comment about 9 and 10 should be the other way round I think. |
Peso
2008-10-15 |
re: Delete all subset records Fixed. Thank you. The reason with using the "sum of value columns" is to have a triangular join ( n * m / 2) instead of a cross join (n * m). |
Peso
2008-10-15 |
re: Delete all subset records However, your suggestion needs to be changed to this DELETE a FROM @Sample AS a INNER JOIN ( SELECT MIN(recID) AS recID, col1, col2, col3, userID FROM @Sample GROUP BY col1, col2, col3, userID ) AS b ON b.userID = a.userID AND b.recID <> a.recID WHERE (a.col1 = b.col1 OR a.col1 = '') AND (a.col2 = b.col2 OR a.col2 = '') AND (a.col3 = b.col3 OR a.col3 = '') for taking care of pure duplicates too. |