Let's create some sample data to play with first.
SET NOCOUNT ON;
IF object_id('dbo.testEnvironment') IS NOT NULL
BEGIN
DROP TABLE dbo.testEnvironment;
END;
WITH t1(N) AS (SELECT 1 UNION ALL SELECT 1),
t2(N) AS (SELECT 1 FROM t1 x, t1 y),
t3(N) AS (SELECT 1 FROM t2 x, t2 y),
Tally(N) AS (SELECT TOP 98 ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM t3 x, t3 y),
Tally2(N) AS (SELECT TOP 5 ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM t3 x, t3 y),
Combinations(N) AS (SELECT LTRIM(RTRIM(RTRIM(SUBSTRING(poss,a.N,2)) + SUBSTRING(vowels,b.N,1)))
FROM Tally a
CROSS JOIN Tally2 b
CROSS APPLY (SELECT 'B C D F G H J K L M N P R S T V W Z SCSKKNSNSPSTBLCLFLGLPLSLBRCRDRFRGRPRTRVRSHSMGHCHPHRHWHBWCWSWTW') d(poss)
CROSS APPLY (SELECT 'AEIOU') e(vowels)),
Words (N) AS (SELECT a.N + b.N
FROM Combinations a
CROSS JOIN Combinations b)
SELECT IDENTITY(INT,1,1) AS ID, a.N + b.N AS randomName,
RAND(CHECKSUM(NEWID())) * 30000 /*(Number of days in range)*/ + CAST('1945' AS DATETIME) /*(Start date, e.g. '1945-01-01 00:00:00*/ AS randomDateTime,
DATEADD(DAY,((ABS(CHECKSUM(NEWID())) % 366 /*(Number of days in range)*/) + 1),CAST('1945' AS DATE) /*(Start date, e.g. '1945-01-01*/) AS randomDate,
ABS(CHECKSUM(NEWID())) AS randomBigInt,
(ABS(CHECKSUM(NEWID())) % 100) + 1 AS randomSmallInt,
RAND(CHECKSUM(NEWID())) * 100 AS randomSmallDec,
RAND(CHECKSUM(NEWID())) AS randomTinyDec,
RAND(CHECKSUM(NEWID())) * 100000 AS randomBigDec,
CONVERT(VARCHAR(6),CONVERT(MONEY,RAND(CHECKSUM(NEWID())) * 100),0) AS randomMoney
INTO dbo.testEnvironment
FROM (SELECT TOP 1000 N
FROM Words
ORDER BY NEWID()) a
CROSS JOIN (SELECT TOP 1000 N
FROM Words
ORDER BY NEWID()) b;
SELECT COUNT(*)
FROM dbo.testEnvironment;
OK, so now we've got 1 million rows, so let's have a look at performance.
First, with no indexes.
SET STATISTICS IO ON;
SET STATISTICS TIME ON;
SELECT COUNT(*)
FROM dbo.testEnvironment
WHERE SQL#.RegEx_IsMatch(randomName,'[A-C][A-F][P-Z]',1,'IgnoreCase')>0
SET STATISTICS TIME OFF;
SET STATISTICS IO OFF;
The above returns: -
-----------
104165
Table 'testEnvironment'. Scan count 5, logical reads 9730, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Worktable'. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
SQL Server Execution Times:
CPU time = 46233 ms, elapsed time = 12223 ms.
Let's have a look at the LIKE equivalent: -
SET STATISTICS IO ON;
SET STATISTICS TIME ON;
SELECT COUNT(*)
FROM dbo.testEnvironment
WHERE UPPER(randomName) LIKE '%[A-C][A-F][P-Z]%'
SET STATISTICS TIME OFF;
SET STATISTICS IO OFF;
What do you think? Slower or faster?
-----------
104165
Table 'testEnvironment'. Scan count 5, logical reads 9730, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
SQL Server Execution Times:
CPU time = 954 ms, elapsed time = 245 ms.
Whoops, not what we were expecting! 😛
OK, the PATINDEX equivalent?
SET STATISTICS IO ON;
SET STATISTICS TIME ON;
SELECT COUNT(*)
FROM dbo.testEnvironment
WHERE PATINDEX('%[A-C][A-F][P-Z]%',UPPER(randomName))>0
SET STATISTICS TIME OFF;
SET STATISTICS IO OFF;
As expected, pretty much identical to LIKE.
-----------
104165
Table 'testEnvironment'. Scan count 5, logical reads 9730, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
SQL Server Execution Times:
CPU time = 922 ms, elapsed time = 242 ms.
OK, let's add a couple of indexes.
CREATE CLUSTERED INDEX cidx_testEnvironment ON dbo.testEnvironment (ID);
CREATE NONCLUSTERED INDEX ncidx_testEnvironment ON dbo.testEnvironment (randomName);
Try again. . .
SET STATISTICS IO ON;
SET STATISTICS TIME ON;
SELECT COUNT(*)
FROM dbo.testEnvironment
WHERE SQL#.RegEx_IsMatch(randomName,'[A-C][A-F][P-Z]',1,'IgnoreCase')>0
SET STATISTICS TIME OFF;
SET STATISTICS IO OFF;
Returns: -
-----------
104165
Table 'testEnvironment'. Scan count 1, logical reads 3046, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
SQL Server Execution Times:
CPU time = 42656 ms, elapsed time = 45748 ms.
Oh dear, it's worse. OK, how about LIKE?
SET STATISTICS IO ON;
SET STATISTICS TIME ON;
SELECT COUNT(*)
FROM dbo.testEnvironment
WHERE UPPER(randomName) LIKE '%[A-C][A-F][P-Z]%'
SET STATISTICS TIME OFF;
SET STATISTICS IO OFF;
Returns: -
-----------
104165
Table 'testEnvironment'. Scan count 1, logical reads 3046, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
SQL Server Execution Times:
CPU time = 1000 ms, elapsed time = 1004 ms.
Also worse. . . and PATINDEX?
SET STATISTICS IO ON;
SET STATISTICS TIME ON;
SELECT COUNT(*)
FROM dbo.testEnvironment
WHERE PATINDEX('%[A-C][A-F][P-Z]%',UPPER(randomName))>0
SET STATISTICS TIME OFF;
SET STATISTICS IO OFF;
Returns: -
-----------
104165
Table 'testEnvironment'. Scan count 1, logical reads 3046, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
SQL Server Execution Times:
CPU time = 938 ms, elapsed time = 925 ms.
Same as LIKE again.
Any disputes about my test results? Because so far I'm thinking that I don't want RegEx in my database 😛