RE: do you have a StripNonNumeric ITVF function?

SSC Guru

Points: 61136

June 25, 2014 at 11:23 am

Gents

First: Dang it Sean - you beet me to it! I had a very similar solution but you posted yours first. That said, I did do a performance test and ran into something I cannot figure out.

on my laptop (Windows 8, 2 cores/4 logical cpu/16GB run SQL Server 2014 Ent) I ran your solution (with results on) and it takes 31 seconds complete (my test harness data below). I tried to improve the time in your query but had no success.

I have an ngrams function that I use for this kind of thing. Below is the function with my notes on how to use it:

IF OBJECT_ID('tempdb.dbo.nGrams8K') IS NOT NULL DROP FUNCTION dbo.nGrams8K

GO

CREATE FUNCTION dbo.nGrams8K (@string varchar(8000), @k int)

/********************************************************************

Created by: Alan Burstein

Created on: 3/10/2014

Last Updated on: 5/22/2015

n-gram defined:

In the fields of computational linguistics and probability,

an n-gram is a contiguous sequence of n items from a given

sequence of text or speech. The items can be phonemes, syllables,

letters, words or base pairs according to the application.

For more information see: http://en.wikipedia.org/wiki/N-gram

Use:

Outputs a stream of tokens based on an input string.

Similar to mdq.nGrams:

http://msdn.microsoft.com/en-us/library/ff487027(v=sql.105).aspx.

Except it only returns characters as long as K.

nGrams8K also includes the position of the "Gram" in the string.

Examples of how to use included as comments after the code.

********************************************************************/

RETURNS TABLE

WITH SCHEMABINDING

AS

RETURN

WITH

E1(N) AS (SELECT 1 FROM (VALUES (null),(null),(null),(null),(null)) x(n)),

E3(N) AS (SELECT 1 FROM E1 a CROSS JOIN E1 b CROSS JOIN E1 c),

iTally(N) AS

(

SELECT TOP (LEN(@string)-(@k-1)) CHECKSUM(ROW_NUMBER() OVER (ORDER BY (SELECT NULL)))

FROM E3 a CROSS JOIN E3 b

)

SELECT

position = N,

token = SUBSTRING(@string,N,@k)

FROM iTally;

GO

/********************************************************************

(1) Basic Use

-----------------------------------------------------------

-- (A) With @k as 1,2 & 3:

SELECT position, token FROM dbo.nGrams8K('abcdefg',1);

SELECT position, token FROM dbo.nGrams8K('abcdefg',2);

SELECT position, token FROM dbo.nGrams8K('abcdefg',3);

-- (B) Using variables

DECLARE

@string varchar(20) = 'abcdefg12345', @tokenLen tinyint = 3;

SELECT position, token FROM dbo.nGrams8K(@string,@tokenLen)

GO

(2) Character Count (including 0 Counts)

-----------------------------------------------------------

-- (A) Basic character count

SELECT token = ISNULL(token,'Total:'), count(*)

FROM dbo.nGrams8K(newid(),1)

GROUP BY token

WITH ROLLUP;-- Added rollup for display

-- (B) Character Account including gaps

DECLARE

@alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ',

@string VARCHAR(100)='The quick green fox jumps over the lazy dog and the lazy dogs.';

SELECT a.token, COUNT(b.token) ttl

FROM dbo.nGrams8K(@alphabet,1) a

LEFT JOIN dbo.nGrams8K(@string,1) b ON a.token=b.token

GROUP BY a.token

ORDER BY a.token; -- not required, for display only

GO

-- (C) Let's try French ordered by most frequent

DECLARE

@alphabet VARCHAR(36)='abcdefghijklmnopqrstuvwxyzéèçëòôöùàâ',

@string VARCHAR(100)='Le renard vert rapide saute par dessus le chien paresseux et le chien paresseux juste posé là.';

WITH charcount AS

(

SELECT a.token, COUNT(b.token) ttl

FROM dbo.nGrams8K(@alphabet,1) a

LEFT JOIN dbo.nGrams8K(@string,1) b ON a.token=b.token

GROUP BY a.token

)

SELECT * FROM charcount ORDER BY ttl DESC;

GO

(3) a *SIMPLE* word count

-----------------------------------------------------------

DECLARE @string varchar(100)='THE QUICK Green FOX JUMPED OVER THE LAZY DOGS BACK';

SELECT @string AS string, count(*)+1 AS words

FROM dbo.nGrams8K(@string,1)

WHERE [token]=' '

GO

(4) search for occurances and location of a substring

-----------------------------------------------------------

DECLARE

@string VARCHAR(100)='The green fox jumps over the dog and the dog just laid there.',

@searchString VARCHAR(100)='The';

SELECT position, token AS searchString

FROM dbo.nGrams8K(@string,LEN(@searchstring)) b

WHERE token=@searchString;

GO

(5) Strip non-numeric characters from a string

-----------------------------------------------------------

DECLARE @string varchar(100) = 'abc123xyz555xxx999!';

WITH stripNonNumeric(cleanstring) AS

(

SELECT token+''

FROM dbo.nGrams8K(@string,1)

WHERE token LIKE '[0-9]'

FOR XML PATH('')

)

SELECT cleanstring

FROM stripNonNumeric;

(6) Find all occurances of a pattern in a string

------------------------------------------------------------

DECLARE

@string varchar(100) = 'zz12x345xxx555abc1234zz5xxx',

@pat varchar(100) = '[a-z][a-z][0-9]',

@len int = 3;

SELECT start_pos = position, token

FROM dbo.nGrams8K(@string,@len)

WHERE token LIKE @pat

SELECT * FROM dbo.findpat8k(@string,@pat,3);

GO

(7) Find the longest common substring between 2 strings

------------------------------------------------------------

-- (A) The Function

IF EXISTS

(

SELECT * FROM INFORMATION_SCHEMA.ROUTINES

WHERE ROUTINE_SCHEMA='dbo' AND ROUTINE_NAME = 'lcss'

)

DROP FUNCTION dbo.lcss;

GO

CREATE FUNCTION dbo.lcss(@string1 varchar(100), @string2 varchar(100))

RETURNS TABLE WITH SCHEMABINDING AS RETURN

WITH iTally(N) AS

(

SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL))

FROM dbo.nGrams8K

(

CASE WHEN LEN(@string1)<=LEN(@string2) THEN @string1

ELSE @string2 END,1

)

SELECT TOP (1) with ties token

FROM iTally

CROSS APPLY

dbo.nGrams8K

(

CASE WHEN LEN(@string1)<=LEN(@string2) THEN @string1

ELSE @string2 END, n

)

WHERE N <=

CASE WHEN LEN(@string1)<=LEN(@string2) THEN LEN(@string1)

ELSE LEN(@string2) END

AND charindex

(

token,

CASE WHEN LEN(@string1)<=LEN(@string2) THEN @string2

ELSE @string1 END

) > 0

ORDER BY len(token) DESC;

GO

-- (B) example of how to use lcss

DECLARE

@string1 varchar(100) = '999xxyyyzaa99xxyyyzz',

@string2 varchar(100) = '000xxxyyyzzz';

SELECT string1 = @string1, string2 = @string2, token

FROM dbo.lcss(@string1, @string2);

********************************************************************/

GO

I ran the following 10K Row Test:

SET NOCOUNT ON;

DBCC FREEPROCCACHE WITH NO_INFOMSGS

DBCC DROPCLEANBUFFERS WITH NO_INFOMSGS

SET STATISTICS IO ON;

SET STATISTICS TIME ON;

PRINT 'Sean'+char(13);

SELECT CleanedText

FROM #strings

CROSS APPLY dbo.StripNonNumeric_itvf(string);

PRINT 'Alan'+char(13);

SELECT CleanedText

FROM #strings

CROSS APPLY dbo.StripNonNumeric_itvf_ajb(string);

SET STATISTICS IO OFF;

SET STATISTICS TIME OFF;

... and here's the results:

Sean

SQL Server Execution Times:

CPU time = 0 ms, elapsed time = 0 ms.

Table '#strings____________________________________________________________________________________________________________00000000004C'.

Scan count 1, logical reads 63, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

SQL Server Execution Times:

CPU time = 31172 ms, elapsed time = 31213 ms.

Alan

SQL Server Execution Times:

CPU time = 0 ms, elapsed time = 0 ms.

Table '#strings____________________________________________________________________________________________________________00000000004C'.

Scan count 1, logical reads 63, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

SQL Server Execution Times:

CPU time = 328 ms, elapsed time = 384 ms.

SQL Server Execution Times:

CPU time = 0 ms, elapsed time = 0 ms.

I have no idea why my NGrams solution is so much faster considering that we are essentially doing the same thing. I included the query plans for both.

Lowell: I hope this helps.

"I cant stress enough the importance of switching from a sequential files mindset to set-based thinking. After you make the switch, you can spend your time tuning and optimizing your queries instead of maintaining lengthy, poor-performing code."

-- Itzik Ben-Gan 2001