String tokenizing / splitting


  • I am fairly certain that a CTE will be faster...

    /*sql server 2008 CTE split and join fun*/

    /*split begins*/

    if object_id(N'tempdb..#split') is not null drop table #split;

    if object_id(N'tempdb..#joined') is not null drop table #joined;

    declare @fun varchar(64) = 'The quick brown fox jumped over the lazy dogs!';

    declare @delim char(1) = ' ';

    select @fun as [Fun];

    with split(i, token, remainder) as

    (select 1

    , left(@fun,charindex(@delim,@fun)-1)

    , LTRIM(right(@fun,len(@fun)-CHARINDEX(@delim,@fun)))

    union all

    select i + 1

    ,case when charindex(@delim,remainder) > 0 then

    left(remainder,charindex(@delim,remainder)-1)

    else remainder end as token

    ,LTRIM(right(remainder,len(remainder)-CHARINDEX(@delim,remainder))) as remainder

    from split

    where charindex(@delim,remainder) >= 0 and token != remainder

    )

    select i, token, remainder

    into #split

    from split;

    select * from #split;

    /*join begins*/

    with joined (i, newfun, token) as (

    select i, convert(varchar(max),token), token

    from #split where i = 1

    union all

    select s.i, j.newfun + @delim + s.token, s.token

    from joined j

    inner join

    #split s

    on s.i = j.i + 1

    )

    select *

    into #joined

    from joined;

    select * from #joined;
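Expressed outside T-SQL for illustration, the recursive CTE above peels one token off the front of the remainder at each recursion level. This is a hedged Python sketch of that logic (the function name and tuple shape are my own; each tuple mirrors the CTE's i, token, remainder columns):

```python
def cte_split(s, delim=' '):
    """Mimic the recursive CTE: peel one token per step, carrying the remainder."""
    rows = []               # (i, token, remainder) triples, like the CTE's columns
    i, remainder = 1, s
    while True:
        pos = remainder.find(delim)
        # Anchor/recursive member: token is everything up to the next delimiter
        token = remainder[:pos] if pos >= 0 else remainder
        # LTRIM(RIGHT(...)) equivalent: drop token + delimiter, trim leading spaces
        if pos >= 0:
            remainder = remainder[pos + 1:].lstrip()
        rows.append((i, token, remainder))
        # Recursion stops when no delimiter remains or token == remainder
        if pos < 0 or token == remainder:
            break
        i += 1
    return rows
```

Like the CTE, the terminal row carries the last token; SQL Server would additionally cap this recursion at the MAXRECURSION limit (100 by default).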

  • Another way to do it is by using the Tally table. Check out Jeff Moden's article http://www.sqlservercentral.com/articles/T-SQL/62867/.

    Here's some example code (how I use it on our systems):

    SET ANSI_NULLS ON

    GO

    SET QUOTED_IDENTIFIER ON

    GO

    CREATE FUNCTION [dbo].[udfTransformStringToColumn] (

    @String VARCHAR(MAX),

    @Delimiter VARCHAR(1)

    )

    RETURNS @List TABLE (ValueColumn VARCHAR(255) NULL)

    AS

    BEGIN

    IF LEFT(@String, 1) <> @Delimiter SET @String = @Delimiter + @String

    IF RIGHT(@String, 1) <> @Delimiter SET @String = @String + @Delimiter

    ;WITH

    cteTally AS

    (

    SELECT TOP (LEN(@String))

    ROW_NUMBER() OVER (ORDER BY t1.Object_ID) AS N

    FROM Master.sys.All_Columns t1

    CROSS JOIN Master.sys.All_Columns t2

    )
    INSERT INTO @List (ValueColumn)

    SELECT SUBSTRING(@String,N+1,CHARINDEX(@Delimiter,@String,N+1)-N-1) AS Value

    FROM cteTally

    WHERE N < LEN(@String)

    AND SUBSTRING(@String,N,1) = @Delimiter

    RETURN

    END

    GO
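The tally-table idea is easier to see outside T-SQL: wrap the string in delimiters, walk every character position, and wherever a delimiter sits, emit the substring up to the next delimiter. A hedged Python sketch of that logic (the function name is mine; this is not part of the original post):

```python
def tally_split(s, delim=';'):
    """Mimic the tally-based UDF: emit the substring after each delimiter position."""
    # Wrap the string in delimiters, as the function's two IF checks do
    if not s.startswith(delim):
        s = delim + s
    if not s.endswith(delim):
        s = s + delim
    out = []
    # range() plays the role of the cteTally numbers (0-based here, 1-based in SQL)
    for n in range(len(s) - 1):          # WHERE N < LEN(@String)
        if s[n] == delim:                # AND SUBSTRING(@String, N, 1) = @Delimiter
            nxt = s.index(delim, n + 1)  # CHARINDEX(@Delimiter, @String, N+1)
            out.append(s[n + 1:nxt])     # SUBSTRING(@String, N+1, ...)
    return out
```

In SQL the per-position scan becomes a set-based join against the tally rows, which is why it scales so well.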

  • I find the below faster on 2005.


    CREATE FUNCTION [Strings].[fnStringSplit_2005_Distinct_CHAR]

    (

    @SourceString VARCHAR(MAX)

    )

    /*======================================================================================

    'P System : Multiple

    'P Subsystem : Common Functions

    'P Script : fnSMS_StringSplit_2005

    'P Creation date : 15/10/2010

    'P

    'P Description : Splits a Comma Delimited String Into a Table. Join as a Table

    'P

    'P SELECT * FROM Strings.fnStringSplit_2005_Distinct_CHAR('9,8,7,6,5,4,3,2,1,')

    'P

    'P Parameters----------------------------------------------------------------------

    'P Inputs : @SourceString - Comma delimited string

    'P Outputs : table variable

    'P====================================================================================*/

    RETURNS @Values TABLE

    (

    --POSITION INT IDENTITY,

    VALUE VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS

    PRIMARY KEY ([VALUE])

    )

    AS

    BEGIN

    DECLARE @vchString VARCHAR(MAX)

    DECLARE @xmlString XML

    BEGIN

    SET @vchString = @SourceString

    IF RIGHT(@vchString,1) = ','

    SET @vchString = LEFT(@vchString, LEN(@vchString)-1)

    /*------------------------------------------

    | Convert the string to xml

    '-----------------------------------------*/

    SET @xmlString = CAST('<i>' + REPLACE(@vchString, ',', '</i><i>') + '</i>' AS XML)

    /*------------------------------------------

    | Read xml into a table variable

    '-----------------------------------------*/

    INSERT INTO @Values(VALUE)

    SELECT DISTINCT x.i.value('.', 'VARCHAR(255)') AS Item

    FROM @xmlString.nodes('//i') x(i)

    END

    RETURN

    END

    /*---------------------------------------------------------------------------------------

    |--------------------------- End of common function -----------------------------|

    |--------------------------------------------------------------------------------------*/


  • Small code for SQL Server 2008. :)

    CREATE FUNCTION dbo.fn_Split

    (

    @InputString VARCHAR(MAX),

    @Delimiter VARCHAR(MAX)

    )

    RETURNS @data TABLE (

    Data VARCHAR(MAX)

    )

    AS

    BEGIN

    DECLARE @DelimiterLen BIGINT = LEN(@Delimiter)

    DECLARE @DelimiterCharIndex BIGINT = CHARINDEX(@Delimiter,@InputString)

    WHILE (@DelimiterCharIndex > 0)

    BEGIN

    INSERT INTO @data VALUES (SUBSTRING(@InputString, 1, @DelimiterCharIndex - 1))

    SET @InputString = SUBSTRING(@InputString, @DelimiterCharIndex + @DelimiterLen, LEN(@InputString))

    SET @DelimiterCharIndex = CHARINDEX(@Delimiter, @InputString)

    END

    INSERT INTO @data VALUES (@InputString)

    RETURN

    END
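The WHILE loop above is a plain find-and-slice scan: cut off everything before the delimiter, drop the delimiter, repeat, then keep whatever is left. A hedged Python sketch of the same loop (illustrative translation, not the poster's code):

```python
def fn_split(s, delim):
    """Mimic dbo.fn_Split: iteratively slice off tokens at each delimiter."""
    out = []
    dlen = len(delim)               # @DelimiterLen; multi-character delimiters work
    pos = s.find(delim)             # CHARINDEX(@Delimiter, @InputString)
    while pos >= 0:                 # WHILE (@DelimiterCharIndex > 0)
        out.append(s[:pos])         # INSERT everything before the delimiter
        s = s[pos + dlen:]          # drop the token and the delimiter
        pos = s.find(delim)
    out.append(s)                   # the final INSERT after the loop
    return out
```

Note the trailing insert: a string ending in the delimiter yields a final empty token, which matches the behavior discussed later in the thread.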

  • Fast, but not good. :(

    DECLARE @inputString varchar(MAX) = '1<i>;2;4;5;'

    Executing it fails with 'XML parsing: line 1, character 97257, unexpected end of input'.

    You need to validate the input string first.

  • CREATE FUNCTION [dbo].[Split](@data VARCHAR(MAX), @delimiter VARCHAR(20))

    RETURNS @t TABLE (Element VARCHAR(MAX))

    AS

    BEGIN

    DECLARE @textXML XML

    SET @textXML = CAST('<d>' + REPLACE(REPLACE(REPLACE(@data,'&','~~amper~~'),'<','~~lt~~'), @delimiter, '</d><d>') + '</d>' AS XML)

    INSERT INTO @t(element)

    SELECT REPLACE(REPLACE(T.split.value('.', 'varchar(max)'),'~~amper~~','&'), '~~lt~~','<') AS data

    FROM @textXML.nodes('/d') T(split)

    RETURN

    END
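This version dodges the XML-parsing failure by replacing the characters XML cannot contain with placeholder strings before building the document, then undoing the substitution on each extracted element. A hedged Python sketch of the round-trip (the function name is mine; it assumes the delimiter itself contains no `&` or `<`):

```python
def xml_safe_split(data, delim):
    """Mimic dbo.Split: escape XML-breaking characters, split, then unescape."""
    # Same placeholders as the REPLACE chain; '&' must be escaped before '<'
    escaped = data.replace('&', '~~amper~~').replace('<', '~~lt~~')
    # The delimiter replace / nodes('/d') step reduces to an ordinary split here
    parts = escaped.split(delim)
    # Undo the escaping on each extracted element, as the outer REPLACEs do
    return [p.replace('~~amper~~', '&').replace('~~lt~~', '<') for p in parts]
```

The input that broke the previous function ('1<i>;2;4;5;') now comes through intact, because the '<' never reaches the XML parser.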

  • dimitri.decoene-1027745 (2/25/2011)


    [quoted: the Tally-table function udfTransformStringToColumn from the earlier post]

    Interesting function. However, you may want to always prepend the delimiter to the string, because the function does not return the correct number of leading empty strings. Change

    IF LEFT(@String, 1) <> @Delimiter SET @String = @Delimiter + @String

    TO

    SET @String = @Delimiter + @String

    SELECT *

    FROM [dbo].[udfTransformStringToColumn]( ';;;Token1;Token2;Token3;Token4;Token5;;', ';' )

    The beginning ";;;" of the string should return 3 blank rows; however, the function returns only 2.
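The off-by-one is easy to demonstrate outside T-SQL. This hedged Python sketch (function names are mine) models the function's wrapping step with both the original conditional prepend and the suggested unconditional one, then counts the tokens between delimiters the way the tally query does:

```python
def split_conditional(s, delim):
    """Mirror the original IFs: prepend/append the delimiter only when missing."""
    if not s.startswith(delim):
        s = delim + s
    if not s.endswith(delim):
        s = s + delim
    # tokens between consecutive delimiters = interior elements of the split
    return s.split(delim)[1:-1]

def split_unconditional(s, delim):
    """The suggested fix: always prepend; append only when missing."""
    s = delim + s
    if not s.endswith(delim):
        s = s + delim
    return s.split(delim)[1:-1]
```

For ';;;Token1;...' the conditional version leaves the string unchanged and yields only 2 leading empty tokens; the unconditional prepend yields the correct 3.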

  • I ran the XML, CTE, and SUBSTRING functions through and compared the estimated to the actual execution plans. The estimated plans show the XML reader is, by far, the most costly (so costly it can't be compared). The SUBSTRING version shows as twice as expensive as the CTE, but the CTE naturally only works up to 100 tokens (the default MAXRECURSION limit). The actual plans came out equal, so we'd have to design something to check execution at scale... which eliminates the CTE version unless the test is designed around parallel effort rather than sheer token volume.

    You could design a SUBSTRING/CTE hybrid where the CTE is executed inside a TRY/CATCH and you cut over to SUBSTRING if it fails on volume... but it might simply be smarter to use the version that fits your use case best.

    The SUBSTRING version has a bug where if the delimiter is a space or any phrase with a trailing space, it trims off that space (or ignores it) and comes up one character short. For VARCHAR, you can fix this by using the DATALENGTH() function instead of LEN() and for NVARCHAR, use DATALENGTH(), then divide by two.

    I also found a bug in the XML version that used a clustered index on the returned table. That will change the token order and require unique tokens. You can remove the index to avoid natural reordering by the indexer and errors raised by duplicate tokens. Avoiding a tablescan here may or may not make sense for your use case.

    I also forgot to mention the TALLY version. It uses a CTE too, and it was slightly more expensive than the SUBSTRING version, so I didn't bother comparing past that point; it had the worst of both the CTE and SUBSTRING versions.

    One more bug: the XML version does not handle special characters such as the ampersand (&). A CDATA declaration might help, but I did not try it.

  • CTE version, made more like the SUBSTRING version:

    CREATE FUNCTION [dbo].[fnStringSplit_CTE]

    (

    @SourceString VARCHAR(MAX),

    @Delim CHAR(1)

    )

    RETURNS @Values TABLE

    (

    VALUE VARCHAR(MAX)

    )

    AS

    BEGIN

    with split(i, token, remainder) as

    (select 1

    , left(@SourceString,charindex(@delim,@SourceString)-1)

    , LTRIM(right(@SourceString,len(@SourceString)-CHARINDEX(@delim,@SourceString)))

    union all

    select i + 1

    ,case when charindex(@delim,remainder) > 0 then

    left(remainder,charindex(@delim,remainder)-1)

    else remainder end as token

    ,LTRIM(right(remainder,len(remainder)-CHARINDEX(@Delim,remainder))) as remainder

    from split

    where charindex(@delim,remainder) >= 0 and token != remainder

    )

    insert into @Values

    Select token

    from split

    RETURN

    END

    GO

  • Hi,

    @davecason

    Your function only returns 2 rows if a string like 'bla;bla;bla' is given, and if a string like 'bla;bla;bla;' is given, 4 rows are returned.

    Could you correct that?

    regards

    Norbert

  • In your first scenario 3 rows are returned, and in the second scenario 4 rows are returned, matching the delimiter count; e.g. for 'bla;bla;bla;' the last row will be NULL. You can handle this in the code where you call the function.

  • Norbert, try to fix it the way you want it, and post your code if you get stuck. I intentionally attempted to keep the last token if the delimiter was followed by nothing.

  • We've found good use for the following tally table and UDF. It handles empty elements and runs fast for us; we've run both large and small sets against it.

    CREATE TABLE dbo.Tally

    (N INT)

    -- Default Data

    -- Taken from the following website:

    -- http://www.sqlservercentral.com/articles/TSQL/62867/

    --=============================================================================

    -- Create and populate a Tally table

    --=============================================================================

    --===== Conditionally drop

    IF OBJECT_ID('dbo.Tally') IS NOT NULL

    DROP TABLE dbo.Tally

    --===== Create and populate the Tally table on the fly

    SELECT TOP 3000000 --equates to more than 30 years of dates

    IDENTITY(INT,1,1) AS N

    INTO dbo.Tally

    FROM Master.dbo.SysColumns sc1,

    Master.dbo.SysColumns sc2

    --===== Add a Primary Key to maximize performance

    ALTER TABLE dbo.Tally

    ADD CONSTRAINT PK_Tally_N

    PRIMARY KEY CLUSTERED (N) WITH FILLFACTOR = 100

    --===== Let the public use it

    GRANT SELECT, REFERENCES ON dbo.Tally TO PUBLIC

    GO

    CREATE FUNCTION udf_StrList2Table (

    @List NVARCHAR(MAX) = N'',

    @Delimiter NCHAR(1) = N','

    )

    RETURNS TABLE

    AS

    RETURN

    SELECT SUBSTRING( @Delimiter + @List + @Delimiter, n + 1,

    CHARINDEX( @Delimiter, @Delimiter + @List + @Delimiter, n + 1 ) - n - 1 ) AS Value,

    ROW_NUMBER() OVER ( ORDER BY n ) AS ListPos

    FROM Tally

    WHERE SUBSTRING( @Delimiter + @List + @Delimiter, n, 1 ) = @Delimiter

    AND n < CAST(LEN( @Delimiter + @List + @Delimiter ) as int)


  • The tally table method seems to work fine, but the estimated and actual execution plans show a pretty severe penalty. My guess is that the initial file I/O and memory allocation of the table's index are at fault for those costs. I then derived a little test on about 17,000 tokens using the initial method, the XML method, and the tally table. With that said, the tally table won by a long shot when parsing a large array:

    declare @i int = 26, @x varchar(max) = '', @d char(1) = ' ', @j int;

    declare @t table (id tinyint primary key, tokens int, which varchar(32), start datetime, finish datetime);

    --Note: this is going to take a while, so if you want to run this more than once, store this data somewhere...

    set @j = @i*@i;

    while @j > 0 BEGIN

    while @i > 0 BEGIN

    set @x = @x + @d + CHAR(91 - @i);

    set @i = @i - 1;

    END

    set @j = @j - 1;

    set @i = 26

    END

    declare @C int;

    insert into @t (id,which,start) values (1,'fnStringSplit_2005_Distinct_CHAR',getdate());

    select @C = COUNT(*) from ..[fnStringSplit_2005_Distinct_CHAR](@x,@d);

    update @t set tokens = @C, finish = GETDATE() where id = 1;

    insert into @t (id,which,start) values (2,'udf_StrList2Table',getdate());

    select @C = COUNT(*) from ..[udf_StrList2Table] (@x,@d)

    update @t set tokens = @C, finish = GETDATE() where id = 2;

    insert into @t (id,which,start) values (3,'fn_Split',getdate());

    select @C = COUNT(*) from ..[fn_Split](@x,@d)

    update @t set tokens = @C, finish = GETDATE() where id = 3;

    select *, CONVERT(float, finish) - convert(float,start) as runtime from @t;

    The results show that the clear winner is the tally table. Volume seems to be a very large consideration in choosing a method. It looks like arrays of enormous scale should be parsed with the tally table method:

    id | tokens | which                            | start   | finish  | runtime

    2  | 17577  | udf_StrList2Table                | 54:52.0 | 54:52.0 | 0.00000057870056480169

    3  | 17577  | fn_Split                         | 54:52.0 | 55:04.0 | 0.00013854166900273400

    1  | 17577  | fnStringSplit_2005_Distinct_CHAR | 51:26.9 | 54:52.0 | 0.00237364969507325000

    I added my CTE function back and reran with a limit of 100 tokens (the CTE recursion max):

    declare @i int = 26, @x varchar(max) = '', @d char(1) = ' ', @j int;

    declare @t table (id tinyint primary key, tokens int, which varchar(32), start datetime, finish datetime);

    set @j = @i;

    while @j > 0 BEGIN

    while @i > 0 BEGIN

    set @x = @x + @d + CHAR(91 - @i);

    set @i = @i - 1;

    END

    set @j = @j - 1;

    set @i = 26

    END

    set @x = LEFT(@x,100)

    declare @C int;

    insert into @t (id,which,start) values (0,'fnStringSplit_CTE',getdate());

    select @C = COUNT(*) from ..[fnStringSplit_CTE](@x,@d)

    update @t set tokens = @C, finish = GETDATE() where id = 0;

    insert into @t (id,which,start) values (1,'fnStringSplit_2005_Distinct_CHAR',getdate());

    select @C = COUNT(*) from ..[fnStringSplit_2005_Distinct_CHAR](@x,@d);

    update @t set tokens = @C, finish = GETDATE() where id = 1;

    insert into @t (id,which,start) values (2,'udf_StrList2Table',getdate());

    select @C = COUNT(*) from ..[udf_StrList2Table] (@x,@d)

    update @t set tokens = @C, finish = GETDATE() where id = 2;

    insert into @t (id,which,start) values (3,'fn_Split',getdate());

    select @C = COUNT(*) from ..[fn_Split](@x,@d)

    update @t set tokens = @C, finish = GETDATE() where id = 3;

    select *, CONVERT(float, finish) - convert(float,start) as runtime from @t;

    The results show the CTE is the winner, but the others are fairly close, with the tally table last:

    id | tokens | which                            | start   | finish  | runtime

    0  | 51     | fnStringSplit_CTE                | 10:29.7 | 10:29.7 | 0.00000003858440322801

    1  | 51     | fnStringSplit_2005_Distinct_CHAR | 10:29.7 | 10:29.7 | 0.00000027006171876565

    3  | 51     | fn_Split                         | 10:29.7 | 10:29.8 | 0.00000027006171876565

    2  | 51     | udf_StrList2Table                | 10:29.7 | 10:29.7 | 0.00000038580037653446

    The funny part about this one is that if I ran it a dozen times, only the tally table's result changed dramatically when splitting a small array of values (it sometimes took a lot longer... randomly).

    So what I take from this is that the CTE is best for small arrays and the tally table is best for large ones. The one last test I did not try is running several thousand small splits at volume (maybe 200 sets of 100 would do it). The only way I could think of to mock this up is a highly parallel package call via SSIS (basically, the same function called 2000 times at the same time). If I feel inspired, I'll give it a shot. Any suggestions for an easier parallelization test are welcome.
