### http://www.sqlservercentral.com/blogs/philfactor/2008/10/09/validating-text-data_3A00_-a-few-techniques/

Printed 2014/04/16 10:53PM

## Validating text data. A few techniques.

### By Phil Factor, 2008/10/09

/*datafile used in the SQL  */

/* Character-frequency tally: join each character position of @String against
   the numbers (tally) table and count occurrences of every distinct character. */
DECLARE @String VARCHAR(MAX)
SELECT @String = 'When you are starting out to validate text input, perhaps to clean up a database, you will, at some point need to get down to the character level. Doing a simple character frequency of a string is pretty easy. all you need to do is to do is this   '
SELECT   COUNT(*) AS [frequency],
         '''' + CONVERT(CHAR(1), SUBSTRING(@String, number, 1)) + '''' AS [character]
FROM     numbers
-- the '|' suffix defeats LEN's trailing-space trimming, so trailing
-- blanks in @String are counted; number then runs 1..LEN(@String)
WHERE    number < LEN(@String + '|')
GROUP BY SUBSTRING(@String, number, 1)
ORDER BY COUNT(*) DESC
/*
frequency   character
----------- ---------
23          ' '
8           'e'
7           't'
7           's'
7           'i'
7           'o'
6           'a'
...            ...
...            ...
*/

-- Vowel-to-letter ratio (as a percentage). Only alphabetic characters are
-- counted (white-space and punctuation excluded via LIKE) and Y is treated
-- as a vowel.
SELECT CONVERT(NUMERIC(16, 2),
           100.00 *
           SUM(CASE
                   WHEN SUBSTRING(@String, number, 1)
                        IN ('a', 'e', 'i', 'o', 'u', 'y') THEN 1
                   ELSE 0
               END) / COUNT(*)
       ) AS [vowel ratio]
FROM   numbers
-- '|' suffix stops LEN from trimming trailing spaces
WHERE  number < LEN(@String + '|')
  AND  SUBSTRING(@String, number, 1) LIKE '[A-Za-z]'

/* Baseline: run the same vowel-ratio calculation over the complete text of
   Moby Dick to get a representative figure for ordinary English prose. */
DECLARE @LotsOfText VARCHAR(MAX)
-- OPENROWSET(BULK ...) requires SQL Server 2005/2008 or later
SELECT @LotsOfText = BulkColumn
FROM   OPENROWSET(BULK 'C:\workbench\moby-dick.txt', SINGLE_BLOB) AS x
SELECT CONVERT(NUMERIC(16, 2),
           100.00 *
           SUM(CASE
                   WHEN SUBSTRING(@LotsOfText, number, 1)
                        IN ('a', 'e', 'i', 'o', 'u', 'y') THEN 1
                   ELSE 0
               END) / COUNT(*)
       ) AS [vowel ratio]
FROM   numbers
-- '|' suffix stops LEN from trimming trailing spaces
WHERE  number < LEN(@LotsOfText + '|')
  AND  SUBSTRING(@LotsOfText, number, 1) LIKE '[A-Za-z]'
--vowel ratio
-------------
--39.20

/* Ok, with this tool, we have something that we can use to flag up text entry which is anomalous. What about people's names, though? This could be a lot trickier. After all, Dwight Ng is a valid name. That has a ratio of 12.5%*/

-- Scratch table for name-validation statistics: one row per person, with the
-- vowel count, alphabetic-character count, and their percentage ratio filled
-- in by the UPDATE statements that follow.
-- Guarded drop: a bare DROP TABLE errors on first run when the table does not
-- yet exist. DROP TABLE IF EXISTS needs SQL Server 2016+, so use OBJECT_ID,
-- which works on the 2005/8 versions this script targets.
IF OBJECT_ID('dbo.PeoplesNames', 'U') IS NOT NULL
    DROP TABLE PeoplesNames
CREATE TABLE PeoplesNames

(

firstname VARCHAR(50) NOT NULL,

lastname VARCHAR(50) NOT NULL,

[vowels] INT,      -- count of a,e,i,o,u,y in "firstname lastname"

[characters] INT,  -- count of [A-Za-z] characters in "firstname lastname"

[Ratio] INT        -- vowels * 100 / characters; INT storage truncates the fraction

)

--we'll import a million names in here just to test things out.

/* Populate the per-name statistics: for each row, probe every character
   position of "firstname lastname" via the numbers table, counting the
   alphabetic characters and the vowels (Y included). Both assignments read
   the pre-update row, so their order is immaterial. */
UPDATE PeoplesNames
SET Characters = (SELECT COUNT(*)
                  FROM   numbers
                  -- '|' suffix stops LEN from trimming trailing spaces
                  WHERE  number < LEN(Firstname + ' ' + lastname + '|')
                    AND  SUBSTRING(Firstname + ' ' + lastname, number, 1)
                         LIKE '[A-Za-z]'),
    vowels     = (SELECT COUNT(*)
                  FROM   numbers
                  WHERE  number < LEN(Firstname + ' ' + lastname + '|')
                    AND  SUBSTRING(Firstname + ' ' + lastname, number, 1)
                         IN ('a', 'e', 'i', 'o', 'u', 'y'))

-- Percentage of alphabetic characters that are vowels. NULLIF guards against
-- a divide-by-zero error for any row whose name contains no [A-Za-z]
-- characters (ratio is left NULL for such rows); assignment into the INT
-- column truncates the fractional part.
UPDATE   PeoplesNames
SET      ratio = vowels * 100.00 / NULLIF(characters, 0)

/*
Dwight Schwartz--14%
Dwight Mc Knight--14%
Aimee Yu--85%

So now we can suspect any name less than 10% or more than 90% vowel. However, if you have stored the vowel ratios, then you can set your criteria for inspection as sensitive as you need without making the lights dim every time you do a query.

But this is a bit crude....*/

/* Bigram frequency: count every two-letter combination appearing in the
   sample text; only pairs where both characters are alphabetic qualify. */
DECLARE @String VARCHAR(MAX)
SELECT @String = 'Another approach is to look at the frequency range of the two-letter combinations used in your text. You will find that there are only a  limited number that are used in English'
SELECT   COUNT(*) AS [frequency],
         '''' + CONVERT(CHAR(2), SUBSTRING(@String, number, 2)) + '''' AS [biChar]
FROM     numbers
-- '|' suffix stops LEN from trimming trailing spaces
WHERE    number < LEN(@String + '|')
  AND    SUBSTRING(@String, number, 2) LIKE '[A-Za-z][A-Za-z]'
GROUP BY SUBSTRING(@String, number, 2)
ORDER BY COUNT(*) DESC

/* Baseline: the same bigram frequency analysis over the complete text of
   Moby Dick, giving a reference distribution for English. */
DECLARE @LotsOfText VARCHAR(MAX)
-- OPENROWSET(BULK ...) requires SQL Server 2005/2008 or later
SELECT @LotsOfText = BulkColumn
FROM   OPENROWSET(BULK 'C:\workbench\moby-dick.txt', SINGLE_BLOB) AS x
SELECT   COUNT(*) AS [frequency],
         '''' + CONVERT(CHAR(2), SUBSTRING(@LotsOfText, number, 2)) + '''' AS [biChar]
FROM     numbers
-- '|' suffix stops LEN from trimming trailing spaces
WHERE    number < LEN(@LotsOfText + '|')
  AND    SUBSTRING(@LotsOfText, number, 2) LIKE '[A-Za-z][A-Za-z]'
GROUP BY SUBSTRING(@LotsOfText, number, 2)
ORDER BY COUNT(*) DESC
--490
/*if you run this, you'll see that only around a third of the possible two-letter combinations are used. Here it gets fun. You can now score your strings to see how closely they conform to the standard frequency for two-letter combinations. You could just create an index that deducts a point every time an unusual two-character combination is found in text. Hmmm. I think I've provided enough code for one morning! At this point it is worth experimenting to see what best fits your particular application */